From 9cdf679fc1efe1979210fb5a77ef4d25905cbc72 Mon Sep 17 00:00:00 2001
From: Eduardo Kalinowski
Date: Sat, 24 Jul 2021 14:27:46 -0300
Subject: [PATCH] Fix interrupted downloads with --latest-stamps (#1219)

The most recent post is cached in NodeIterator (and saved to disk), and
its timestamp is used instead of the time instaloader was run. This way,
even in later resuming runs, the stored timestamp is the same one the
first run would have stored.

Fixes #1206.
---
 instaloader/instaloader.py  | 25 +++++++++++++------------
 instaloader/nodeiterator.py | 23 ++++++++++++++++++++---
 instaloader/structures.py   |  2 ++
 3 files changed, 35 insertions(+), 15 deletions(-)

diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py
index 2ed91fe..e83a64f 100644
--- a/instaloader/instaloader.py
+++ b/instaloader/instaloader.py
@@ -1203,14 +1203,14 @@ class Instaloader:
         if latest_stamps is not None:
             last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username)
             posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
-        scraped_timestamp = datetime.now().astimezone()
-        self.posts_download_loop(profile.get_tagged_posts(),
+        tagged_posts = profile.get_tagged_posts()
+        self.posts_download_loop(tagged_posts,
                                  target if target
                                  else (Path(_PostPathFormatter.sanitize_path(profile.username)) /
                                        _PostPathFormatter.sanitize_path(':tagged')),
                                  fast_update, post_filter, takewhile=posts_takewhile)
-        if latest_stamps is not None:
-            latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp)
+        if latest_stamps is not None and tagged_posts.first_item is not None:
+            latest_stamps.set_last_tagged_timestamp(profile.username, tagged_posts.first_item.date_local.astimezone())
 
     def download_igtv(self, profile: Profile, fast_update: bool = False,
                       post_filter: Optional[Callable[[Post], bool]] = None,
@@ -1226,11 +1226,11 @@ class Instaloader:
         if latest_stamps is not None:
             last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username)
             posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
-        scraped_timestamp = datetime.now().astimezone()
-        self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter,
+        igtv_posts = profile.get_igtv_posts()
+        self.posts_download_loop(igtv_posts, profile.username, fast_update, post_filter,
                                  total_count=profile.igtvcount, owner_profile=profile, takewhile=posts_takewhile)
-        if latest_stamps is not None:
-            latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp)
+        if latest_stamps is not None and igtv_posts.first_item is not None:
+            latest_stamps.set_last_igtv_timestamp(profile.username, igtv_posts.first_item.date_local.astimezone())
 
     def _get_id_filename(self, profile_name: str) -> str:
         if ((format_string_contains_key(self.dirname_pattern, 'profile') or
@@ -1424,12 +1424,13 @@ class Instaloader:
                         # pylint:disable=cell-var-from-loop
                         last_scraped = latest_stamps.get_last_post_timestamp(profile_name)
                         posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
-                    scraped_timestamp = datetime.now().astimezone()
-                    self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
+                    posts_to_download = profile.get_posts()
+                    self.posts_download_loop(posts_to_download, profile_name, fast_update, post_filter,
                                              total_count=profile.mediacount, owner_profile=profile,
                                              takewhile=posts_takewhile)
-                    if latest_stamps is not None:
-                        latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp)
+                    if latest_stamps is not None and posts_to_download.first_item is not None:
+                        latest_stamps.set_last_post_timestamp(profile_name,
+                                                              posts_to_download.first_item.date_local.astimezone())
 
         if stories and profiles:
             with self.context.error_catcher("Download stories"):
diff --git a/instaloader/nodeiterator.py b/instaloader/nodeiterator.py
index cb030b2..90d3937 100644
--- a/instaloader/nodeiterator.py
+++ b/instaloader/nodeiterator.py
@@ -17,7 +17,8 @@ FrozenNodeIterator = NamedTuple('FrozenNodeIterator',
                                  ('context_username', Optional[str]),
                                  ('total_index', int),
                                  ('best_before', Optional[float]),
-                                 ('remaining_data', Optional[Dict])])
+                                 ('remaining_data', Optional[Dict]),
+                                 ('first_node', Optional[Dict])])
 FrozenNodeIterator.query_hash.__doc__ = """The GraphQL ``query_hash`` parameter."""
 FrozenNodeIterator.query_variables.__doc__ = """The GraphQL ``query_variables`` parameter."""
 FrozenNodeIterator.query_referer.__doc__ = """The HTTP referer used for the GraphQL query."""
@@ -26,7 +27,7 @@ FrozenNodeIterator.total_index.__doc__ = """Number of items that have already be
 FrozenNodeIterator.best_before.__doc__ = """Date when parts of the stored nodes might have expired."""
 FrozenNodeIterator.remaining_data.__doc__ = \
     """The already-retrieved, yet-unprocessed ``edges`` and the ``page_info`` at time of freezing."""
-
+FrozenNodeIterator.first_node.__doc__ = """Node data of the first item, if an item has been produced."""
 
 T = TypeVar('T')
 
@@ -89,6 +90,7 @@ class NodeIterator(Iterator[T]):
             self._best_before = datetime.now() + NodeIterator._shelf_life
         else:
             self._data = self._query()
+        self._first_node: Optional[Dict] = None
 
     def _query(self, after: Optional[str] = None) -> Dict:
         pagination_variables = {'first': NodeIterator._graphql_page_length}  # type: Dict[str, Any]
@@ -125,7 +127,10 @@ class NodeIterator(Iterator[T]):
             except KeyboardInterrupt:
                 self._page_index, self._total_index = page_index, total_index
                 raise
-            return self._node_wrapper(node)
+            item = self._node_wrapper(node)
+            if self._first_node is None:
+                self._first_node = node
+            return item
         if self._data['page_info']['has_next_page']:
             query_response = self._query(self._data['page_info']['end_cursor'])
             page_index, data = self._page_index, self._data
@@ -157,6 +162,15 @@ class NodeIterator(Iterator[T]):
             ).encode())
         return base64.urlsafe_b64encode(magic_hash.digest()).decode()
 
+    @property
+    def first_item(self) -> Optional[T]:
+        """
+        If this iterator has produced any items, returns the first item produced.
+
+        .. versionadded:: 4.8
+        """
+        return self._node_wrapper(self._first_node) if self._first_node is not None else None
+
     def freeze(self) -> FrozenNodeIterator:
         """Freeze the iterator for later resuming."""
         remaining_data = None
@@ -171,6 +185,7 @@ class NodeIterator(Iterator[T]):
             total_index=max(self.total_index - 1, 0),
             best_before=self._best_before.timestamp() if self._best_before else None,
             remaining_data=remaining_data,
+            first_node=self._first_node,
         )
 
     def thaw(self, frozen: FrozenNodeIterator) -> None:
@@ -197,6 +212,8 @@ class NodeIterator(Iterator[T]):
         self._total_index = frozen.total_index
         self._best_before = datetime.fromtimestamp(frozen.best_before)
         self._data = frozen.remaining_data
+        if frozen.first_node is not None:
+            self._first_node = frozen.first_node
 
 
 @contextmanager
diff --git a/instaloader/structures.py b/instaloader/structures.py
index 7100c1f..de21e5c 100644
--- a/instaloader/structures.py
+++ b/instaloader/structures.py
@@ -1643,6 +1643,8 @@ def load_structure(context: InstaloaderContext, json_structure: dict) -> JsonExp
     elif node_type == "Hashtag":
        return Hashtag(context, json_structure['node'])
     elif node_type == "FrozenNodeIterator":
+        if not 'first_node' in json_structure['node']:
+            json_structure['node']['first_node'] = None
         return FrozenNodeIterator(**json_structure['node'])
     elif 'shortcode' in json_structure:
         # Post JSON created with Instaloader v3
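
The mechanism the patch relies on can be summarized as follows: the iterator remembers the raw node of the first (newest) item it ever yielded, that node survives a freeze()/thaw() cycle, and the caller stores that item's own timestamp rather than the wall-clock time of the run. The snippet below is a minimal, self-contained sketch of that pattern; TinyIterator and its fields are hypothetical stand-ins for illustration, not instaloader code.

from typing import Dict, Iterator, List, Optional

class TinyIterator(Iterator[Dict]):
    """Hypothetical stand-in for NodeIterator, reduced to the first_node logic."""

    def __init__(self, nodes: List[Dict]):
        self._nodes = nodes
        self._index = 0
        self._first_node: Optional[Dict] = None

    def __next__(self) -> Dict:
        if self._index >= len(self._nodes):
            raise StopIteration
        node = self._nodes[self._index]
        self._index += 1
        if self._first_node is None:
            self._first_node = node    # remember the newest (first-yielded) item
        return node

    @property
    def first_item(self) -> Optional[Dict]:
        return self._first_node

    def freeze(self) -> Dict:
        # instaloader additionally persists this state in its resume files
        return {'index': self._index, 'first_node': self._first_node}

    def thaw(self, frozen: Dict) -> None:
        self._index = frozen['index']
        if frozen['first_node'] is not None:
            self._first_node = frozen['first_node']

posts = [{'id': 3, 'taken_at': 300}, {'id': 2, 'taken_at': 200}, {'id': 1, 'taken_at': 100}]

it = TinyIterator(posts)
next(it)                    # run gets interrupted after the newest post
state = it.freeze()         # frozen state still knows which item came first

resumed = TinyIterator(posts)
resumed.thaw(state)
for _ in resumed:           # finish the remaining items in the resumed run
    pass

# Store the newest item's own timestamp, not datetime.now(): the value is the
# same whether the run was interrupted and resumed or completed in one go.
assert resumed.first_item is not None
assert resumed.first_item['taken_at'] == 300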