From 672b0b184019bf840e3003e90a1c662fcb4df79a Mon Sep 17 00:00:00 2001 From: Alexander Graf <17130992+aandergr@users.noreply.github.com> Date: Sat, 30 Jan 2021 16:54:35 +0100 Subject: [PATCH] Fix efficiency bug regarding re-downloading targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove unnecessary queries when a file in a target does not have to be re-downloaded. This significantly speeds up the download in cases where the majority of media is already present locally. Co-Authored-By: André Koch-Kramer --- instaloader/instaloader.py | 90 +++++++++++++++++++++++++++----------- instaloader/structures.py | 17 +++++-- 2 files changed, 78 insertions(+), 29 deletions(-) diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index 90ff125..f6dbd9f 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -551,6 +551,27 @@ class Instaloader: :return: True if something was downloaded, False otherwise, i.e. file was already there """ + def _already_downloaded(path: str) -> bool: + if not os.path.isfile(path): + return False + else: + self.context.log(path + ' exists', end=' ', flush=True) + return True + + def _all_already_downloaded(path_base, is_videos_enumerated) -> bool: + if '{filename}' in self.filename_pattern: + # full URL needed to evaluate actual filename, cannot determine at + # this point if all sidecar nodes were already downloaded. + return False + for idx, is_video in is_videos_enumerated: + if self.download_pictures and (not is_video or self.download_video_thumbnails): + if not _already_downloaded("{0}_{1}.jpg".format(path_base, idx)): + return False + if is_video and self.download_videos: + if not _already_downloaded("{0}_{1}.mp4".format(path_base, idx)): + return False + return True + dirname = _PostPathFormatter(post).format(self.dirname_pattern, target=target) filename_template = os.path.join(dirname, self.format_filename(post, target=target)) filename = self.__prepare_filename(filename_template, lambda: post.url) @@ -559,37 +580,45 @@ class Instaloader: downloaded = True if post.typename == 'GraphSidecar': if self.download_pictures or self.download_videos: - for edge_number, sidecar_node in enumerate( - post.get_sidecar_nodes(self.slide_start, self.slide_end), - start=post.mediacount if self.slide_start < 0 else self.slide_start + 1 + if not _all_already_downloaded( + filename_template, enumerate( + (post.get_is_videos()[i] + for i in range(self.slide_start % post.mediacount, self.slide_end % post.mediacount + 1)), + start=self.slide_start % post.mediacount + 1 + ) ): - if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails): - suffix = str(edge_number) + for edge_number, sidecar_node in enumerate( + post.get_sidecar_nodes(self.slide_start, self.slide_end), + start=self.slide_start % post.mediacount + 1 + ): + suffix = str(edge_number) # type: Optional[str] if '{filename}' in self.filename_pattern: - suffix = '' - # pylint:disable=cell-var-from-loop - filename = self.__prepare_filename(filename_template, lambda: sidecar_node.display_url) - # Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails) - downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url, - mtime=post.date_local, filename_suffix=suffix) - if sidecar_node.is_video and self.download_videos: - suffix = str(edge_number) - if '{filename}' in self.filename_pattern: - suffix = '' - # pylint:disable=cell-var-from-loop - filename = self.__prepare_filename(filename_template, lambda: sidecar_node.video_url) - # Download sidecar video if desired - downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url, - mtime=post.date_local, filename_suffix=suffix) + suffix = None + if self.download_pictures and (not sidecar_node.is_video or self.download_video_thumbnails): + # pylint:disable=cell-var-from-loop + filename = self.__prepare_filename(filename_template, lambda: sidecar_node.display_url) + # Download sidecar picture or video thumbnail (--no-pictures implies --no-video-thumbnails) + downloaded &= self.download_pic(filename=filename, url=sidecar_node.display_url, + mtime=post.date_local, filename_suffix=suffix) + if sidecar_node.is_video and self.download_videos: + # pylint:disable=cell-var-from-loop + filename = self.__prepare_filename(filename_template, lambda: sidecar_node.video_url) + # Download sidecar video if desired + downloaded &= self.download_pic(filename=filename, url=sidecar_node.video_url, + mtime=post.date_local, filename_suffix=suffix) + else: + downloaded = False elif post.typename == 'GraphImage': # Download picture if self.download_pictures: - downloaded = self.download_pic(filename=filename, url=post.url, mtime=post.date_local) + downloaded = (not _already_downloaded(filename + ".jpg") and + self.download_pic(filename=filename, url=post.url, mtime=post.date_local)) elif post.typename == 'GraphVideo': # Download video thumbnail (--no-pictures implies --no-video-thumbnails) if self.download_pictures and self.download_video_thumbnails: with self.context.error_catcher("Video thumbnail of {}".format(post)): - downloaded = self.download_pic(filename=filename, url=post.url, mtime=post.date_local) + downloaded = (not _already_downloaded(filename + ".jpg") and + self.download_pic(filename=filename, url=post.url, mtime=post.date_local)) else: self.context.error("Warning: {0} has unknown typename: {1}".format(post, post.typename)) @@ -600,7 +629,8 @@ class Instaloader: # Download video if desired if post.is_video and self.download_videos: - downloaded &= self.download_pic(filename=filename, url=post.video_url, mtime=post.date_local) + downloaded &= (not _already_downloaded(filename + ".mp4") and + self.download_pic(filename=filename, url=post.video_url, mtime=post.date_local)) # Download geotags if desired if self.download_geotags and post.location: @@ -693,17 +723,25 @@ class Instaloader: :return: True if something was downloaded, False otherwise, i.e. file was already there """ + def _already_downloaded(path: str) -> bool: + if not os.path.isfile(path): + return False + else: + self.context.log(path + ' exists', end=' ', flush=True) + return True + date_local = item.date_local dirname = _PostPathFormatter(item).format(self.dirname_pattern, target=target) filename_template = os.path.join(dirname, self.format_filename(item, target=target)) filename = self.__prepare_filename(filename_template, lambda: item.url) downloaded = False if not item.is_video or self.download_video_thumbnails is True: - url = item.url - downloaded = self.download_pic(filename=filename, url=url, mtime=date_local) + downloaded = (not _already_downloaded(filename + ".jpg") and + self.download_pic(filename=filename, url=item.url, mtime=date_local)) if item.is_video and self.download_videos is True: filename = self.__prepare_filename(filename_template, lambda: str(item.video_url)) - downloaded |= self.download_pic(filename=filename, url=item.video_url, mtime=date_local) + downloaded |= (not _already_downloaded(filename + ".mp4") and + self.download_pic(filename=filename, url=item.video_url, mtime=date_local)) # Save caption if desired metadata_string = _ArbitraryItemFormatter(item).format(self.storyitem_metadata_txt_pattern).strip() if metadata_string: diff --git a/instaloader/structures.py b/instaloader/structures.py index 1a64584..3604f65 100644 --- a/instaloader/structures.py +++ b/instaloader/structures.py @@ -272,6 +272,17 @@ class Post: return len(edges) return 1 + def get_is_videos(self) -> List[bool]: + """ + Return a list containing the ``is_video`` property for each media in the post. + + .. versionadded:: 4.7 + """ + if self.typename == 'GraphSidecar': + edges = self._field('edge_sidecar_to_children', 'edges') + return [edge['node']['is_video'] for edge in edges] + return [self.is_video] + def get_sidecar_nodes(self, start=0, end=-1) -> Iterator[PostSidecarNode]: """ Sidecar nodes of a Post with typename==GraphSidecar. @@ -281,13 +292,13 @@ class Post: """ if self.typename == 'GraphSidecar': edges = self._field('edge_sidecar_to_children', 'edges') - if any(edge['node']['is_video'] for edge in edges): - # video_url is only present in full metadata, issue #558. - edges = self._full_metadata['edge_sidecar_to_children']['edges'] if end < 0: end = len(edges)-1 if start < 0: start = len(edges)-1 + if any(self.get_is_videos()[start:(end+1)]): + # video_url is only present in full metadata, issue #558. + edges = self._full_metadata['edge_sidecar_to_children']['edges'] for idx, edge in enumerate(edges): if start <= idx <= end: node = edge['node']