Make downloads using --latest-stamps resumable (#1193)

Using itertools.takewhile() on a NodeIterator returns a plain Iterator, and so it's not resumable. The strategy has been altered to pass an extra argument to posts_download_loop, a lambda that is evaluated for each post, and causes the loop to stop when it returns false.
2024-11-04 17:32:30 +01:00 · 2021-06-16 17:19:16 -03:00 · 2021-06-16 17:19:16 -03:00 · 455a757159
commit 455a757159
parent ac5d6e312a
1 changed files with 25 additions and 22 deletions
--- a/instaloader/instaloader.py
+++ b/instaloader/instaloader.py
@ -11,7 +11,6 @@ from contextlib import contextmanager, suppress
 from datetime import datetime, timezone
 from functools import wraps
 from io import BytesIO
 from itertools import takewhile
 from pathlib import Path
 from typing import Any, Callable, IO, Iterator, List, Optional, Set, Union, cast
 from urllib.parse import urlparse
@ -803,14 +802,13 @@ class Instaloader:
            self.context.log(msg)
            totalcount = user_story.itemcount
            count = 1
            stories_to_download = user_story.get_items()
            if latest_stamps is not None:
                # pylint:disable=cell-var-from-loop
                last_scraped = latest_stamps.get_last_story_timestamp(name)
                stories_to_download = takewhile(lambda s: s.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
                                                stories_to_download)
                scraped_timestamp = datetime.now().astimezone()
-            for item in stories_to_download:
+            for item in user_story.get_items():
                if last_scraped is not None and item.date_utc.replace(tzinfo=timezone.utc) <= last_scraped:
                    break
                if storyitem_filter is not None and not storyitem_filter(item):
                    self.context.log("<{} skipped>".format(item), flush=True)
                    continue
@ -932,7 +930,8 @@ class Instaloader:
                            post_filter: Optional[Callable[[Post], bool]] = None,
                            max_count: Optional[int] = None,
                            total_count: Optional[int] = None,
-                            owner_profile: Optional[Profile] = None) -> None:
+                            owner_profile: Optional[Profile] = None,
                            takewhile: Optional[Callable[[Post], bool]] = None) -> None:
        """
        Download the Posts returned by given Post Iterator.
@ -941,6 +940,9 @@ class Instaloader:
        .. versionchanged:: 4.5
           Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`.
        .. versionchanged:: 4.8
           Add `takewhile` parameter.
        :param posts: Post Iterator to loop through.
        :param target: Target name.
        :param fast_update: :option:`--fast-update`.
@ -948,12 +950,15 @@ class Instaloader:
        :param max_count: Maximum count of Posts to download (:option:`--count`).
        :param total_count: Total number of posts returned by given iterator.
        :param owner_profile: Associated profile, if any.
        :param takewhile: Expression evaluated for each post. Once it returns false, downloading stops.
        """
        displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count
                           else total_count)
        sanitized_target = target
        if isinstance(target, str):
            sanitized_target = _PostPathFormatter.sanitize_path(target)
        if takewhile is None:
            takewhile = lambda _: True
        with resumable_iteration(
                context=self.context,
                iterator=posts,
@ -966,7 +971,7 @@ class Instaloader:
                enabled=self.resume_prefix is not None
        ) as (is_resuming, start_index):
            for number, post in enumerate(posts, start=start_index + 1):
-                if max_count is not None and number > max_count:
+                if (max_count is not None and number > max_count) or not takewhile(post):
                    break
                if displayed_count is not None:
                    self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number, displayed_count,
@ -1194,17 +1199,16 @@ class Instaloader:
        .. versionchanged:: 4.8
           Add `latest_stamps` parameter."""
        self.context.log("Retrieving tagged posts for profile {}.".format(profile.username))
-        posts_to_download: Iterator[Post] = profile.get_tagged_posts()
+        posts_takewhile: Optional[Callable[[Post], bool]] = None
        if latest_stamps is not None:
            last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username)
-            posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
+            posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
                                          posts_to_download)
            scraped_timestamp = datetime.now().astimezone()
-        self.posts_download_loop(posts_to_download,
+        self.posts_download_loop(profile.get_tagged_posts(),
                                 target if target
                                 else (Path(_PostPathFormatter.sanitize_path(profile.username)) /
                                       _PostPathFormatter.sanitize_path(':tagged')),
-                                 fast_update, post_filter)
+                                 fast_update, post_filter, takewhile=posts_takewhile)
        if latest_stamps is not None:
            latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp)
@ -1218,14 +1222,13 @@ class Instaloader:
        .. versionchanged:: 4.8
           Add `latest_stamps` parameter."""
        self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username))
-        posts_to_download: Iterator[Post] = profile.get_igtv_posts()
+        posts_takewhile: Optional[Callable[[Post], bool]] = None
        if latest_stamps is not None:
            last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username)
-            posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
+            posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
                                          posts_to_download)
            scraped_timestamp = datetime.now().astimezone()
-        self.posts_download_loop(posts_to_download, profile.username, fast_update, post_filter,
+        self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter,
-                                 total_count=profile.igtvcount, owner_profile=profile)
+                                 total_count=profile.igtvcount, owner_profile=profile, takewhile=posts_takewhile)
        if latest_stamps is not None:
            latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp)
@ -1416,15 +1419,15 @@ class Instaloader:
                # Iterate over pictures and download them
                if posts:
                    self.context.log("Retrieving posts from profile {}.".format(profile_name))
-                    posts_to_download: Iterator[Post] = profile.get_posts()
+                    posts_takewhile: Optional[Callable[[Post], bool]] = None
                    if latest_stamps is not None:
                        # pylint:disable=cell-var-from-loop
                        last_scraped = latest_stamps.get_last_post_timestamp(profile_name)
-                        posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
+                        posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
                                                      posts_to_download)
                        scraped_timestamp = datetime.now().astimezone()
-                    self.posts_download_loop(posts_to_download, profile_name, fast_update, post_filter,
+                    self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
-                                             total_count=profile.mediacount, owner_profile=profile)
+                                             total_count=profile.mediacount, owner_profile=profile,
                                             takewhile=posts_takewhile)
                    if latest_stamps is not None:
                        latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp)