1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-11-04 17:32:30 +01:00

Make downloads using --latest-stamps resumable (#1193)

Using itertools.takewhile() on a NodeIterator returns a plain Iterator
instead of a NodeIterator, so the download is no longer resumable.

The strategy has been changed: posts_download_loop now takes an extra
argument, a callable that is evaluated for each post and causes the
loop to stop as soon as it returns false.
This commit is contained in:
Eduardo Kalinowski 2021-06-16 17:19:16 -03:00 committed by GitHub
parent ac5d6e312a
commit 455a757159
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -11,7 +11,6 @@ from contextlib import contextmanager, suppress
from datetime import datetime, timezone from datetime import datetime, timezone
from functools import wraps from functools import wraps
from io import BytesIO from io import BytesIO
from itertools import takewhile
from pathlib import Path from pathlib import Path
from typing import Any, Callable, IO, Iterator, List, Optional, Set, Union, cast from typing import Any, Callable, IO, Iterator, List, Optional, Set, Union, cast
from urllib.parse import urlparse from urllib.parse import urlparse
@ -803,14 +802,13 @@ class Instaloader:
self.context.log(msg) self.context.log(msg)
totalcount = user_story.itemcount totalcount = user_story.itemcount
count = 1 count = 1
stories_to_download = user_story.get_items()
if latest_stamps is not None: if latest_stamps is not None:
# pylint:disable=cell-var-from-loop # pylint:disable=cell-var-from-loop
last_scraped = latest_stamps.get_last_story_timestamp(name) last_scraped = latest_stamps.get_last_story_timestamp(name)
stories_to_download = takewhile(lambda s: s.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
stories_to_download)
scraped_timestamp = datetime.now().astimezone() scraped_timestamp = datetime.now().astimezone()
for item in stories_to_download: for item in user_story.get_items():
if last_scraped is not None and item.date_utc.replace(tzinfo=timezone.utc) <= last_scraped:
break
if storyitem_filter is not None and not storyitem_filter(item): if storyitem_filter is not None and not storyitem_filter(item):
self.context.log("<{} skipped>".format(item), flush=True) self.context.log("<{} skipped>".format(item), flush=True)
continue continue
@ -932,7 +930,8 @@ class Instaloader:
post_filter: Optional[Callable[[Post], bool]] = None, post_filter: Optional[Callable[[Post], bool]] = None,
max_count: Optional[int] = None, max_count: Optional[int] = None,
total_count: Optional[int] = None, total_count: Optional[int] = None,
owner_profile: Optional[Profile] = None) -> None: owner_profile: Optional[Profile] = None,
takewhile: Optional[Callable[[Post], bool]] = None) -> None:
""" """
Download the Posts returned by given Post Iterator. Download the Posts returned by given Post Iterator.
@ -941,6 +940,9 @@ class Instaloader:
.. versionchanged:: 4.5 .. versionchanged:: 4.5
Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`. Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`.
.. versionchanged:: 4.8
Add `takewhile` parameter.
:param posts: Post Iterator to loop through. :param posts: Post Iterator to loop through.
:param target: Target name. :param target: Target name.
:param fast_update: :option:`--fast-update`. :param fast_update: :option:`--fast-update`.
@ -948,12 +950,15 @@ class Instaloader:
:param max_count: Maximum count of Posts to download (:option:`--count`). :param max_count: Maximum count of Posts to download (:option:`--count`).
:param total_count: Total number of posts returned by given iterator. :param total_count: Total number of posts returned by given iterator.
:param owner_profile: Associated profile, if any. :param owner_profile: Associated profile, if any.
:param takewhile: Expression evaluated for each post. Once it returns false, downloading stops.
""" """
displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count
else total_count) else total_count)
sanitized_target = target sanitized_target = target
if isinstance(target, str): if isinstance(target, str):
sanitized_target = _PostPathFormatter.sanitize_path(target) sanitized_target = _PostPathFormatter.sanitize_path(target)
if takewhile is None:
takewhile = lambda _: True
with resumable_iteration( with resumable_iteration(
context=self.context, context=self.context,
iterator=posts, iterator=posts,
@ -966,7 +971,7 @@ class Instaloader:
enabled=self.resume_prefix is not None enabled=self.resume_prefix is not None
) as (is_resuming, start_index): ) as (is_resuming, start_index):
for number, post in enumerate(posts, start=start_index + 1): for number, post in enumerate(posts, start=start_index + 1):
if max_count is not None and number > max_count: if (max_count is not None and number > max_count) or not takewhile(post):
break break
if displayed_count is not None: if displayed_count is not None:
self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number, displayed_count, self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number, displayed_count,
@ -1194,17 +1199,16 @@ class Instaloader:
.. versionchanged:: 4.8 .. versionchanged:: 4.8
Add `latest_stamps` parameter.""" Add `latest_stamps` parameter."""
self.context.log("Retrieving tagged posts for profile {}.".format(profile.username)) self.context.log("Retrieving tagged posts for profile {}.".format(profile.username))
posts_to_download: Iterator[Post] = profile.get_tagged_posts() posts_takewhile: Optional[Callable[[Post], bool]] = None
if latest_stamps is not None: if latest_stamps is not None:
last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username) last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username)
posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped, posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
posts_to_download)
scraped_timestamp = datetime.now().astimezone() scraped_timestamp = datetime.now().astimezone()
self.posts_download_loop(posts_to_download, self.posts_download_loop(profile.get_tagged_posts(),
target if target target if target
else (Path(_PostPathFormatter.sanitize_path(profile.username)) / else (Path(_PostPathFormatter.sanitize_path(profile.username)) /
_PostPathFormatter.sanitize_path(':tagged')), _PostPathFormatter.sanitize_path(':tagged')),
fast_update, post_filter) fast_update, post_filter, takewhile=posts_takewhile)
if latest_stamps is not None: if latest_stamps is not None:
latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp) latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp)
@ -1218,14 +1222,13 @@ class Instaloader:
.. versionchanged:: 4.8 .. versionchanged:: 4.8
Add `latest_stamps` parameter.""" Add `latest_stamps` parameter."""
self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username)) self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username))
posts_to_download: Iterator[Post] = profile.get_igtv_posts() posts_takewhile: Optional[Callable[[Post], bool]] = None
if latest_stamps is not None: if latest_stamps is not None:
last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username) last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username)
posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped, posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
posts_to_download)
scraped_timestamp = datetime.now().astimezone() scraped_timestamp = datetime.now().astimezone()
self.posts_download_loop(posts_to_download, profile.username, fast_update, post_filter, self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter,
total_count=profile.igtvcount, owner_profile=profile) total_count=profile.igtvcount, owner_profile=profile, takewhile=posts_takewhile)
if latest_stamps is not None: if latest_stamps is not None:
latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp) latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp)
@ -1416,15 +1419,15 @@ class Instaloader:
# Iterate over pictures and download them # Iterate over pictures and download them
if posts: if posts:
self.context.log("Retrieving posts from profile {}.".format(profile_name)) self.context.log("Retrieving posts from profile {}.".format(profile_name))
posts_to_download: Iterator[Post] = profile.get_posts() posts_takewhile: Optional[Callable[[Post], bool]] = None
if latest_stamps is not None: if latest_stamps is not None:
# pylint:disable=cell-var-from-loop # pylint:disable=cell-var-from-loop
last_scraped = latest_stamps.get_last_post_timestamp(profile_name) last_scraped = latest_stamps.get_last_post_timestamp(profile_name)
posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped, posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
posts_to_download)
scraped_timestamp = datetime.now().astimezone() scraped_timestamp = datetime.now().astimezone()
self.posts_download_loop(posts_to_download, profile_name, fast_update, post_filter, self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
total_count=profile.mediacount, owner_profile=profile) total_count=profile.mediacount, owner_profile=profile,
takewhile=posts_takewhile)
if latest_stamps is not None: if latest_stamps is not None:
latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp) latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp)