From 3bfc82f7c0d7c85585786cc8e2556a20da9ca112 Mon Sep 17 00:00:00 2001
From: Adam Howard <medavox2+github@gmail.com>
Date: Wed, 11 Nov 2015 13:07:09 +0000
Subject: [PATCH] Refactoring YoutubeExtractor: - replaced single use of
 terrible_unescape_workaround_fuck(String) with call to
 URLDecoder.decode(String, String) * tested new regex implementation of
 YoutubeExtractor.getVideoId(String) - deleted old HashMap-based
 implementation of YoutubeExtractor.getVideoId(String) * Miscellaneous typo
 corrections * replaced direct page-scraping extraction of video publication
 date in YoutubeExtractor.getVideoInfo(String) with jsoup-based scrape of
 <meta> tag field in YYYY-MM-DD format * similarly, replaced direct
 page-scraping extraction of view count with <meta> tag field.

Both <meta> tag fields still need to be formatted in a locale-specific way.
---
 NewPipe.iml                                   | 19 ----
 .../newpipe/youtube/YoutubeExtractor.java     | 87 ++++---------------
 2 files changed, 19 insertions(+), 87 deletions(-)
 delete mode 100644 NewPipe.iml
diff --git a/NewPipe.iml b/NewPipe.iml
deleted file mode 100644
index 8d3b0cd6c..000000000
--- a/NewPipe.iml
+++ /dev/null
@@ -1,19 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<module external.linked.project.id="NewPipe" external.linked.project.path="$MODULE_DIR$" external.root.project.path="$MODULE_DIR$" external.system.id="GRADLE" external.system.module.group="" external.system.module.version="unspecified" type="JAVA_MODULE" version="4">
-  <component name="FacetManager">
-    <facet type="java-gradle" name="Java-Gradle">
-      <configuration>
-        <option name="BUILD_FOLDER_PATH" value="$MODULE_DIR$/build" />
-        <option name="BUILDABLE" value="false" />
-      </configuration>
-    </facet>
-  </component>
-  <component name="NewModuleRootManager" LANGUAGE_LEVEL="JDK_1_7" inherit-compiler-output="true">
-    <exclude-output />
-    <content url="file://$MODULE_DIR$">
-      <excludeFolder url="file://$MODULE_DIR$/.gradle" />
-    </content>
-    <orderEntry type="inheritedJdk" />
-    <orderEntry type="sourceFolder" forTests="false" />
-  </component>
-</module>
\ No newline at end of file
diff --git a/app/src/main/java/org/schabi/newpipe/youtube/YoutubeExtractor.java b/app/src/main/java/org/schabi/newpipe/youtube/YoutubeExtractor.java
index a9cf3807a..48f87fa9a 100644
--- a/app/src/main/java/org/schabi/newpipe/youtube/YoutubeExtractor.java
+++ b/app/src/main/java/org/schabi/newpipe/youtube/YoutubeExtractor.java
@@ -20,6 +20,7 @@ import org.xmlpull.v1.XmlPullParser;
 
 import java.io.StringReader;
 import java.net.URI;
+import java.net.URLDecoder;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Vector;
@@ -113,7 +114,7 @@ public class YoutubeExtractor implements Extractor {
             JSONObject jsonObj = new JSONObject(jsonString);
 
             //----------------------------------
-            // load an parse description code
+            // load and parse description code
             //----------------------------------
             if (decryptionCode.isEmpty()) {
                 JSONObject ytAssets = jsonObj.getJSONObject("assets");
@@ -149,43 +150,7 @@ public class YoutubeExtractor implements Extractor {
         id = mat.group(1);
         return (id == null ? "" : id);
     }
-/*
-    @Override
-    public String getVideoId(String videoUrl) {
-        try {
-            URI uri = new URI(videoUrl);
-            if(uri.getHost().contains("youtube")) {
-                String query = uri.getFragment();
-                if(query == null) {
-                    query = uri.getQuery();
-                } else {
-                    query = query.replace("/watch?", "");
-                }
-                String queryElements[] = query.split("&");
-                Map<String, String> queryArguments = new HashMap<>();
-                for (String e : queryElements) {
-                    String[] s = e.split("=");
-                    queryArguments.put(s[0], s[1]);
-                }
-                return queryArguments.get("v");
-            } else if(uri.getHost().contains("youtu.be")) {
-                // uri.getRawPath() does somehow not return the last character.
-                // so we do a workaround instead.
-                //return uri.getRawPath();
-                String url[] = videoUrl.split("/");
-                return url[url.length-1];
-            } else {
-                Log.e(TAG, "Error could not parse url: " + videoUrl);
 
-            }
-        }  catch(Exception e) {
-            Log.e(TAG, "Error could not parse url: " + videoUrl);
-            e.printStackTrace();
-            return "";
-        }
-        return null;
-    }
-*/
     @Override
     public String getVideoUrl(String videoId) {
         return "https://www.youtube.com/watch?v=" + videoId;
@@ -198,18 +163,17 @@ public class YoutubeExtractor implements Extractor {
 
         Document doc = Jsoup.parse(site, siteUrl);
 
-        videoInfo.id = matchGroup1("v=([0-9a-zA-Z]*)", siteUrl);
+        videoInfo.id = matchGroup1("v=([0-9a-zA-Z]{10,})", siteUrl);
 
         videoInfo.age_limit = 0;
         videoInfo.webpage_url = siteUrl;
 
-
         initService(site);
 
         //-------------------------------------
         // extracting form player args
         //-------------------------------------
-        JSONObject playerArgs = null;
+        JSONObject playerArgs;
         {
             try {
                 String jsonString = matchGroup1("ytplayer.config\\s*=\\s*(\\{.*?\\});", site);
@@ -221,6 +185,8 @@ public class YoutubeExtractor implements Extractor {
                 // If we fail in this part the video is most likely not available.
                 // Determining why is done later.
                 videoInfo.videoAvailableStatus = VideoInfo.VIDEO_UNAVAILABLE;
+                //exit early, since we can't extract other args
+                return videoInfo;
             }
         }
 
@@ -244,7 +210,7 @@ public class YoutubeExtractor implements Extractor {
 
             videoInfo.uploader = playerArgs.getString("author");
             videoInfo.title = playerArgs.getString("title");
-            //first attempt gating a small image version
+            //first attempt getting a small image version
             //in the html extracting part we try to get a thumbnail with a higher resolution
             videoInfo.thumbnail_url = playerArgs.getString("thumbnail_url");
             videoInfo.duration = playerArgs.getInt("length_seconds");
@@ -263,7 +229,7 @@ public class YoutubeExtractor implements Extractor {
                 }
 
                 int itag = Integer.parseInt(tags.get("itag"));
-                String streamUrl = terrible_unescape_workaround_fuck(tags.get("url"));
+                String streamUrl = URLDecoder.decode(tags.get("url"), "UTF-8");
 
                 // if video has a signature: decrypt it and add it to the url
                 if(tags.get("s") != null) {
@@ -301,16 +267,19 @@ public class YoutubeExtractor implements Extractor {
             videoInfo.thumbnail_url = doc.select("link[itemprop=\"thumbnailUrl\"]").first()
                     .attr("abs:href");
         } catch(Exception e) {
-            Log.i(TAG, "Could not find high res Thumbnail. Use low res instead");
+            Log.i(TAG, "Could not find high res Thumbnail. Using low res instead");
         }
 
         // upload date
-        videoInfo.upload_date = doc.select("strong[class=\"watch-time-text\"").first()
-                .text();
+        //videoInfo.upload_date = doc.select("strong[class=\"watch-time-text\"").first().text();
+        videoInfo.upload_date = doc.select("meta[itemprop=datePublished]").attr("content");
 
         // Extracting the date itself from header
-        videoInfo.upload_date =
-                matchGroup1("([0-9]{2}\\.[0-9]{2}\\.[0-9]{4})", videoInfo.upload_date);
+        //videoInfo.upload_date =
+        //        matchGroup1("([0-9]{2}\\.[0-9]{2}\\.[0-9]{4})", videoInfo.upload_date);
+
+        //TODO: Format date locale-specifically
+
 
         // description
         videoInfo.description = doc.select("p[id=\"eow-description\"]").first()
@@ -322,7 +291,6 @@ public class YoutubeExtractor implements Extractor {
                     .getAllElements().select("button")
                     .select("span").get(0).text();
 
-
             // dislikes
             videoInfo.dislike_count = doc.select("span[class=\"like-button-renderer \"]").first()
                     .getAllElements().select("button")
@@ -339,23 +307,18 @@ public class YoutubeExtractor implements Extractor {
                 .attr("abs:data-thumb");
 
         // view count
-        videoInfo.view_count = doc.select("div[class=\"watch-view-count\"]").first().text();
-
-        // Extracting the number of views from header
-        videoInfo.view_count = matchGroup1("([0-9,]*$)", videoInfo.view_count);
+        videoInfo.view_count = doc.select("meta[itemprop=interactionCount]").attr("content");
 
         // next video
         videoInfo.nextVideo = extractVideoInfoItem(doc.select("div[class=\"watch-sidebar-section\"]").first()
                 .select("li").first());
 
-        int i = 0;
         // related videos
         Vector<VideoInfoItem> relatedVideos = new Vector<>();
         for(Element li : doc.select("ul[id=\"watch-related\"]").first().children()) {
             // first check if we have a playlist. If so leave them out
             if(li.select("a[class*=\"content-link\"]").first() != null) {
                 relatedVideos.add(extractVideoInfoItem(li));
-                i++;
             }
         }
         videoInfo.relatedVideos = relatedVideos.toArray(new VideoInfoItem[relatedVideos.size()]);
@@ -436,6 +399,7 @@ public class YoutubeExtractor implements Extractor {
             e.printStackTrace();
         }
 
+        //todo: check NullPointerException causing
         info.title = li.select("span[class=\"title\"]").first().text();
         info.view_count = li.select("span[class*=\"view-count\"]").first().text();
         info.uploader = li.select("span[class=\"g-hovercard\"]").first().text();
@@ -455,19 +419,6 @@ public class YoutubeExtractor implements Extractor {
         return info;
     }
 
-    private String terrible_unescape_workaround_fuck(String shit) {
-        String[] splitAtEscape = shit.split("%");
-        String retval = "";
-        retval += splitAtEscape[0];
-        for(int i = 1; i < splitAtEscape.length; i++) {
-            String escNum = splitAtEscape[i].substring(0, 2);
-            char c = (char) Integer.parseInt(escNum,16);
-            retval += c;
-            retval += splitAtEscape[i].substring(2);
-        }
-        return retval;
-    }
-
     private String loadDecryptionCode(String playerUrl) {
         String playerCode = Downloader.download(playerUrl);
         String decryptionFuncName = "";
@@ -523,7 +474,7 @@ public class YoutubeExtractor implements Extractor {
             return mat.group(1);
         }
         else {
-            Log.e(TAG, "failed to find pattern \""+pattern+"\"inside of \""+input+"\"");
+            Log.e(TAG, "failed to find pattern \""+pattern+"\" inside of \""+input+"\"");
             new Exception("failed to find pattern \""+pattern+"\"").printStackTrace();
             return "";
         }