mirror of
https://github.com/instaloader/instaloader.git
synced 2024-11-04 17:32:30 +01:00
Make downloads using --latest-stamps resumable (#1193)
Using itertools.takewhile() on a NodeIterator returns a plain Iterator, which is not resumable. The strategy has been altered: posts_download_loop now accepts an extra `takewhile` argument, a callable (typically a lambda) that is evaluated for each post and causes the loop to stop as soon as it returns false.
This commit is contained in:
parent
ac5d6e312a
commit
455a757159
@ -11,7 +11,6 @@ from contextlib import contextmanager, suppress
|
|||||||
from datetime import datetime, timezone
|
from datetime import datetime, timezone
|
||||||
from functools import wraps
|
from functools import wraps
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from itertools import takewhile
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable, IO, Iterator, List, Optional, Set, Union, cast
|
from typing import Any, Callable, IO, Iterator, List, Optional, Set, Union, cast
|
||||||
from urllib.parse import urlparse
|
from urllib.parse import urlparse
|
||||||
@ -803,14 +802,13 @@ class Instaloader:
|
|||||||
self.context.log(msg)
|
self.context.log(msg)
|
||||||
totalcount = user_story.itemcount
|
totalcount = user_story.itemcount
|
||||||
count = 1
|
count = 1
|
||||||
stories_to_download = user_story.get_items()
|
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
# pylint:disable=cell-var-from-loop
|
# pylint:disable=cell-var-from-loop
|
||||||
last_scraped = latest_stamps.get_last_story_timestamp(name)
|
last_scraped = latest_stamps.get_last_story_timestamp(name)
|
||||||
stories_to_download = takewhile(lambda s: s.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
|
|
||||||
stories_to_download)
|
|
||||||
scraped_timestamp = datetime.now().astimezone()
|
scraped_timestamp = datetime.now().astimezone()
|
||||||
for item in stories_to_download:
|
for item in user_story.get_items():
|
||||||
|
if last_scraped is not None and item.date_utc.replace(tzinfo=timezone.utc) <= last_scraped:
|
||||||
|
break
|
||||||
if storyitem_filter is not None and not storyitem_filter(item):
|
if storyitem_filter is not None and not storyitem_filter(item):
|
||||||
self.context.log("<{} skipped>".format(item), flush=True)
|
self.context.log("<{} skipped>".format(item), flush=True)
|
||||||
continue
|
continue
|
||||||
@ -932,7 +930,8 @@ class Instaloader:
|
|||||||
post_filter: Optional[Callable[[Post], bool]] = None,
|
post_filter: Optional[Callable[[Post], bool]] = None,
|
||||||
max_count: Optional[int] = None,
|
max_count: Optional[int] = None,
|
||||||
total_count: Optional[int] = None,
|
total_count: Optional[int] = None,
|
||||||
owner_profile: Optional[Profile] = None) -> None:
|
owner_profile: Optional[Profile] = None,
|
||||||
|
takewhile: Optional[Callable[[Post], bool]] = None) -> None:
|
||||||
"""
|
"""
|
||||||
Download the Posts returned by given Post Iterator.
|
Download the Posts returned by given Post Iterator.
|
||||||
|
|
||||||
@ -941,6 +940,9 @@ class Instaloader:
|
|||||||
.. versionchanged:: 4.5
|
.. versionchanged:: 4.5
|
||||||
Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`.
|
Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`.
|
||||||
|
|
||||||
|
.. versionchanged:: 4.8
|
||||||
|
Add `takewhile` parameter.
|
||||||
|
|
||||||
:param posts: Post Iterator to loop through.
|
:param posts: Post Iterator to loop through.
|
||||||
:param target: Target name.
|
:param target: Target name.
|
||||||
:param fast_update: :option:`--fast-update`.
|
:param fast_update: :option:`--fast-update`.
|
||||||
@ -948,12 +950,15 @@ class Instaloader:
|
|||||||
:param max_count: Maximum count of Posts to download (:option:`--count`).
|
:param max_count: Maximum count of Posts to download (:option:`--count`).
|
||||||
:param total_count: Total number of posts returned by given iterator.
|
:param total_count: Total number of posts returned by given iterator.
|
||||||
:param owner_profile: Associated profile, if any.
|
:param owner_profile: Associated profile, if any.
|
||||||
|
:param takewhile: Expression evaluated for each post. Once it returns false, downloading stops.
|
||||||
"""
|
"""
|
||||||
displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count
|
displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count
|
||||||
else total_count)
|
else total_count)
|
||||||
sanitized_target = target
|
sanitized_target = target
|
||||||
if isinstance(target, str):
|
if isinstance(target, str):
|
||||||
sanitized_target = _PostPathFormatter.sanitize_path(target)
|
sanitized_target = _PostPathFormatter.sanitize_path(target)
|
||||||
|
if takewhile is None:
|
||||||
|
takewhile = lambda _: True
|
||||||
with resumable_iteration(
|
with resumable_iteration(
|
||||||
context=self.context,
|
context=self.context,
|
||||||
iterator=posts,
|
iterator=posts,
|
||||||
@ -966,7 +971,7 @@ class Instaloader:
|
|||||||
enabled=self.resume_prefix is not None
|
enabled=self.resume_prefix is not None
|
||||||
) as (is_resuming, start_index):
|
) as (is_resuming, start_index):
|
||||||
for number, post in enumerate(posts, start=start_index + 1):
|
for number, post in enumerate(posts, start=start_index + 1):
|
||||||
if max_count is not None and number > max_count:
|
if (max_count is not None and number > max_count) or not takewhile(post):
|
||||||
break
|
break
|
||||||
if displayed_count is not None:
|
if displayed_count is not None:
|
||||||
self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number, displayed_count,
|
self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number, displayed_count,
|
||||||
@ -1194,17 +1199,16 @@ class Instaloader:
|
|||||||
.. versionchanged:: 4.8
|
.. versionchanged:: 4.8
|
||||||
Add `latest_stamps` parameter."""
|
Add `latest_stamps` parameter."""
|
||||||
self.context.log("Retrieving tagged posts for profile {}.".format(profile.username))
|
self.context.log("Retrieving tagged posts for profile {}.".format(profile.username))
|
||||||
posts_to_download: Iterator[Post] = profile.get_tagged_posts()
|
posts_takewhile: Optional[Callable[[Post], bool]] = None
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username)
|
last_scraped = latest_stamps.get_last_tagged_timestamp(profile.username)
|
||||||
posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
|
posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
|
||||||
posts_to_download)
|
|
||||||
scraped_timestamp = datetime.now().astimezone()
|
scraped_timestamp = datetime.now().astimezone()
|
||||||
self.posts_download_loop(posts_to_download,
|
self.posts_download_loop(profile.get_tagged_posts(),
|
||||||
target if target
|
target if target
|
||||||
else (Path(_PostPathFormatter.sanitize_path(profile.username)) /
|
else (Path(_PostPathFormatter.sanitize_path(profile.username)) /
|
||||||
_PostPathFormatter.sanitize_path(':tagged')),
|
_PostPathFormatter.sanitize_path(':tagged')),
|
||||||
fast_update, post_filter)
|
fast_update, post_filter, takewhile=posts_takewhile)
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp)
|
latest_stamps.set_last_tagged_timestamp(profile.username, scraped_timestamp)
|
||||||
|
|
||||||
@ -1218,14 +1222,13 @@ class Instaloader:
|
|||||||
.. versionchanged:: 4.8
|
.. versionchanged:: 4.8
|
||||||
Add `latest_stamps` parameter."""
|
Add `latest_stamps` parameter."""
|
||||||
self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username))
|
self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username))
|
||||||
posts_to_download: Iterator[Post] = profile.get_igtv_posts()
|
posts_takewhile: Optional[Callable[[Post], bool]] = None
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username)
|
last_scraped = latest_stamps.get_last_igtv_timestamp(profile.username)
|
||||||
posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
|
posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
|
||||||
posts_to_download)
|
|
||||||
scraped_timestamp = datetime.now().astimezone()
|
scraped_timestamp = datetime.now().astimezone()
|
||||||
self.posts_download_loop(posts_to_download, profile.username, fast_update, post_filter,
|
self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter,
|
||||||
total_count=profile.igtvcount, owner_profile=profile)
|
total_count=profile.igtvcount, owner_profile=profile, takewhile=posts_takewhile)
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp)
|
latest_stamps.set_last_igtv_timestamp(profile.username, scraped_timestamp)
|
||||||
|
|
||||||
@ -1416,15 +1419,15 @@ class Instaloader:
|
|||||||
# Iterate over pictures and download them
|
# Iterate over pictures and download them
|
||||||
if posts:
|
if posts:
|
||||||
self.context.log("Retrieving posts from profile {}.".format(profile_name))
|
self.context.log("Retrieving posts from profile {}.".format(profile_name))
|
||||||
posts_to_download: Iterator[Post] = profile.get_posts()
|
posts_takewhile: Optional[Callable[[Post], bool]] = None
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
# pylint:disable=cell-var-from-loop
|
# pylint:disable=cell-var-from-loop
|
||||||
last_scraped = latest_stamps.get_last_post_timestamp(profile_name)
|
last_scraped = latest_stamps.get_last_post_timestamp(profile_name)
|
||||||
posts_to_download = takewhile(lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped,
|
posts_takewhile = lambda p: p.date_utc.replace(tzinfo=timezone.utc) > last_scraped
|
||||||
posts_to_download)
|
|
||||||
scraped_timestamp = datetime.now().astimezone()
|
scraped_timestamp = datetime.now().astimezone()
|
||||||
self.posts_download_loop(posts_to_download, profile_name, fast_update, post_filter,
|
self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
|
||||||
total_count=profile.mediacount, owner_profile=profile)
|
total_count=profile.mediacount, owner_profile=profile,
|
||||||
|
takewhile=posts_takewhile)
|
||||||
if latest_stamps is not None:
|
if latest_stamps is not None:
|
||||||
latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp)
|
latest_stamps.set_last_post_timestamp(profile_name, scraped_timestamp)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user