
Resume a previously-aborted post download loop (#732)

With this change, Instaloader is capable of resuming a previously-aborted download loop. When interrupted, it creates a JSON file within the target directory that contains all the information necessary to resume that operation later.

Resuming an interrupted download is supported for most, but not all, targets. It is supported for:

- Regular profile posts,
- IGTV posts,
- Saved posts,
- Tagged posts,
- Explore posts.
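
In API terms, the new mechanism amounts to freezing a NodeIterator when the loop is interrupted. A schematic sketch using only names introduced by this commit ("example_user" and the filename are placeholders):

    from instaloader import Instaloader, Profile, save_structure_to_file

    L = Instaloader()
    posts = Profile.from_username(L.context, "example_user").get_posts()
    try:
        for post in posts:
            L.download_post(post, target="example_user")
    except KeyboardInterrupt:
        # Roughly what Instaloader now does when interrupted: persist the
        # iterator state; a later, matching iterator can thaw it and resume.
        save_structure_to_file(posts.freeze(), "iterator_{}.json.xz".format(posts.magic))
        raise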
Alexander Graf 2020-07-21 17:28:55 +02:00 committed by GitHub
parent c817d1901a
commit bc40b82f94
11 changed files with 531 additions and 130 deletions


@@ -49,7 +49,9 @@
directory accordingly,
- allows **fine-grained customization** of filters and where to store
downloaded media.
downloaded media,
- automatically **resumes previously-interrupted** download iterations.
::


@@ -226,6 +226,19 @@ Exceptions
.. autoexception:: TooManyRequestsException
Resumable Iterations
^^^^^^^^^^^^^^^^^^^^
.. versionadded:: 4.5
.. autoclass:: NodeIterator
:no-show-inheritance:
.. autoclass:: FrozenNodeIterator
:no-show-inheritance:
.. autofunction:: resumable_iteration
``InstaloaderContext`` (Low-level functions)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -241,4 +254,4 @@ Exceptions
.. autoclass:: RateController
:no-show-inheritance:
.. versionadded:: 4.5


@@ -223,6 +223,29 @@ How to Download
``#hashtag`` or the profile name. Defaults to ``{date_utc}_UTC``.
See :ref:`filename-specification` for a list of supported tokens.
.. option:: --resume-prefix prefix
For many targets, Instaloader is capable of resuming a previously-aborted
download loop. To do so, it creates a JSON file within the target directory
when interrupted. This option controls the prefix of the filenames used to
save the information needed to resume an interrupted download. The default
prefix is ``iterator``.
Resuming an interrupted download is supported for most, but not all, targets.
JSON files with resume information are always compressed, regardless of
:option:`--no-compress-json`.
This feature is turned off entirely with :option:`--no-resume`.
.. versionadded:: 4.5
.. option:: --no-resume
Do not resume a previously-aborted download iteration, and do not save such
information when interrupted.
.. versionadded:: 4.5
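An illustrative session (not part of the original option docs; ``profile``
stands for any supported target, and the actual resume filename depends on
the iterator's magic string)::

    $ instaloader --resume-prefix=iterator profile
    ...
    Saved resume information to profile/iterator_<magic>.json.xz.
    $ instaloader --resume-prefix=iterator profile
    Resuming from profile/iterator_<magic>.json.xz.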
.. option:: --user-agent USER_AGENT
User Agent to use for HTTP requests. Per default, Instaloader pretends being


@@ -50,6 +50,8 @@ autodoc_member_order = 'bysource'
intersphinx_mapping = {'python': ('https://docs.python.org/3', None),
'requests': ('https://requests.kennethreitz.org/en/master/', None)}
nitpick_ignore = [('py:class', 'typing.Tuple')]
current_release = subprocess.check_output(["git", "describe", "--abbrev=0"]).decode("ascii")[1:-1]
date_format = "%e %b %Y" if platform.system() != "Windows" else "%d %b %Y"
current_release_date = subprocess.check_output(


@@ -40,6 +40,8 @@ See :ref:`install` for more options on how to install Instaloader.
- allows **fine-grained customization** of filters and where to store
downloaded media,
- automatically **resumes previously-interrupted** download iterations,
- is free `open source <https://github.com/instaloader/instaloader>`__
software written in Python.


@@ -15,5 +15,6 @@ else:
from .exceptions import *
from .instaloader import Instaloader
from .instaloadercontext import InstaloaderContext, RateController
from .nodeiterator import NodeIterator, FrozenNodeIterator, resumable_iteration
from .structures import (Hashtag, Highlight, Post, PostSidecarNode, PostComment, PostCommentAnswer, PostLocation,
Profile, Story, StoryItem, TopSearchResults, load_structure_from_file, save_structure_to_file)


@@ -351,6 +351,13 @@ def main():
'--dirname-pattern. {profile} is replaced by the profile name,'
'{target} is replaced by the target you specified, i.e. either :feed'
'#hashtag or the profile name. Defaults to \'{date_utc}_UTC\'')
g_how.add_argument('--resume-prefix', metavar='PREFIX',
help='Prefix for filenames that are used to save the information to resume an interrupted '
'download.')
g_how.add_argument('--no-resume', action='store_true',
help='Do not resume a previously-aborted download iteration, and do not save such information '
'when interrupted.')
g_how.add_argument('--use-aged-resume-files', action='store_true', help=SUPPRESS)
g_how.add_argument('--user-agent',
help='User Agent to use for HTTP requests. Defaults to \'{}\'.'.format(default_user_agent()))
g_how.add_argument('-S', '--no-sleep', action='store_true', help=SUPPRESS)
@@ -394,6 +401,10 @@
raise SystemExit("--no-captions and --post-metadata-txt or --storyitem-metadata-txt given; "
"That contradicts.")
if args.no_resume and args.resume_prefix:
raise SystemExit("--no-resume and --resume-prefix given; That contradicts.")
resume_prefix = (args.resume_prefix if args.resume_prefix else 'iterator') if not args.no_resume else None
if args.no_pictures and args.fast_update:
raise SystemExit('--no-pictures and --fast-update cannot be used together.')
@@ -412,7 +423,9 @@
post_metadata_txt_pattern=post_metadata_txt_pattern,
storyitem_metadata_txt_pattern=storyitem_metadata_txt_pattern,
max_connection_attempts=args.max_connection_attempts,
request_timeout=args.request_timeout)
request_timeout=args.request_timeout,
resume_prefix=resume_prefix,
check_resume_bbd=not args.use_aged_resume_files)
_main(loader,
args.profile,
username=args.login.lower() if args.login is not None else None,
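
As an editorial aside (not part of the diff), the flag-to-parameter mapping established above can be exercised directly from Python; this is a sketch of the equivalences, with defaults taken from the code:

    from instaloader import Instaloader

    # --resume-prefix PREFIX   ->  resume_prefix="PREFIX" (default: "iterator")
    # --no-resume              ->  resume_prefix=None
    # --use-aged-resume-files  ->  check_resume_bbd=False
    L = Instaloader(resume_prefix="iterator", check_resume_bbd=True)
    L_no_resume = Instaloader(resume_prefix=None)  # equivalent of --no-resume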


@@ -20,8 +20,9 @@ import urllib3 # type: ignore
from .exceptions import *
from .instaloadercontext import InstaloaderContext, RateController
from .nodeiterator import NodeIterator, resumable_iteration
from .structures import (Hashtag, Highlight, JsonExportable, Post, PostLocation, Profile, Story, StoryItem,
save_structure_to_file)
load_structure_from_file, save_structure_to_file)
def get_default_session_filename(username: str) -> str:
@@ -154,6 +155,8 @@ class Instaloader:
:param max_connection_attempts: :option:`--max-connection-attempts`
:param request_timeout: :option:`--request-timeout`, set per-request timeout (seconds)
:param rate_controller: Generator for a :class:`RateController` to override rate controlling behavior
:param resume_prefix: :option:`--resume-prefix`, or None for :option:`--no-resume`.
:param check_resume_bbd: Whether to check the date of expiry of resume files and reject them if expired.
.. attribute:: context
@@ -177,7 +180,9 @@
storyitem_metadata_txt_pattern: str = None,
max_connection_attempts: int = 3,
request_timeout: Optional[float] = None,
rate_controller: Optional[Callable[[InstaloaderContext], RateController]] = None):
rate_controller: Optional[Callable[[InstaloaderContext], RateController]] = None,
resume_prefix: Optional[str] = "iterator",
check_resume_bbd: bool = True):
self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts,
request_timeout, rate_controller)
@@ -196,6 +201,8 @@
else post_metadata_txt_pattern
self.storyitem_metadata_txt_pattern = '' if storyitem_metadata_txt_pattern is None \
else storyitem_metadata_txt_pattern
self.resume_prefix = resume_prefix
self.check_resume_bbd = check_resume_bbd
@contextmanager
def anonymous_copy(self):
@@ -216,7 +223,9 @@
post_metadata_txt_pattern=self.post_metadata_txt_pattern,
storyitem_metadata_txt_pattern=self.storyitem_metadata_txt_pattern,
max_connection_attempts=self.context.max_connection_attempts,
request_timeout=self.context.request_timeout)
request_timeout=self.context.request_timeout,
resume_prefix=self.resume_prefix,
check_resume_bbd=self.check_resume_bbd)
yield new_loader
self.context.error_log.extend(new_loader.context.error_log)
new_loader.context.error_log = [] # avoid double-printing of errors
@@ -356,6 +365,24 @@ class Instaloader:
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
self.context.log('geo', end=' ', flush=True)
def format_filename_within_target_path(self,
target: Union[str, Path],
owner_profile: Optional[Profile],
identifier: str,
name_suffix: str,
extension: str):
"""Returns a filename within the target path.
.. versionadded:: 4.5"""
if ((format_string_contains_key(self.dirname_pattern, 'profile') or
format_string_contains_key(self.dirname_pattern, 'target'))):
profile_str = owner_profile.username.lower() if owner_profile is not None else target
return os.path.join(self.dirname_pattern.format(profile=profile_str, target=target),
'{0}_{1}.{2}'.format(identifier, name_suffix, extension))
else:
return os.path.join(self.dirname_pattern.format(),
'{0}_{1}_{2}.{3}'.format(target, identifier, name_suffix, extension))
@_retry_on_connection_error
def download_title_pic(self, url: str, target: Union[str, Path], name_suffix: str, owner_profile: Optional[Profile],
_attempt: int = 1) -> None:
@@ -376,16 +403,7 @@
else:
pic_bytes = http_response.content
pic_identifier = md5(pic_bytes).hexdigest()[:16]
pic_extension = 'jpg'
if ((format_string_contains_key(self.dirname_pattern, 'profile') or
format_string_contains_key(self.dirname_pattern, 'target'))):
profile_str = owner_profile.username.lower() if owner_profile is not None else target
filename = os.path.join(self.dirname_pattern.format(profile=profile_str,
target=target),
'{0}_{1}.{2}'.format(pic_identifier, name_suffix, pic_extension))
else:
filename = os.path.join(self.dirname_pattern.format(),
'{0}_{1}_{2}.{3}'.format(target, pic_identifier, name_suffix, pic_extension))
filename = self.format_filename_within_target_path(target, owner_profile, pic_identifier, name_suffix, 'jpg')
content_length = http_response.headers.get('Content-Length', None)
if os.path.isfile(filename) and (not self.context.is_logged_in or
(content_length is not None and
@@ -705,59 +723,75 @@ class Instaloader:
fast_update: bool = False,
post_filter: Optional[Callable[[Post], bool]] = None,
max_count: Optional[int] = None,
total_count: Optional[int] = None) -> None:
total_count: Optional[int] = None,
owner_profile: Optional[Profile] = None) -> None:
"""
Download the Posts returned by given Post Iterator.
.. versionadded:: 4.4
.. versionchanged:: 4.5
Transparently resume an aborted operation if `posts` is a :class:`NodeIterator`.
:param posts: Post Iterator to loop through.
:param target: Target name
:param fast_update: :option:`--fast-update`
:param post_filter: :option:`--post-filter`
:param max_count: Maximum count of Posts to download (:option:`--count`)
:param total_count: Total number of posts returned by given iterator
:param target: Target name.
:param fast_update: :option:`--fast-update`.
:param post_filter: :option:`--post-filter`.
:param max_count: Maximum count of Posts to download (:option:`--count`).
:param total_count: Total number of posts returned by given iterator.
:param owner_profile: Associated profile, if any.
"""
for number, post in enumerate(posts):
if max_count is not None and number >= max_count:
break
if total_count is not None:
self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + 1, total_count,
w=len(str(total_count))),
end="", flush=True)
else:
if max_count is not None:
self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + 1, max_count,
w=len(str(max_count))),
displayed_count = (max_count if total_count is None or max_count is not None and max_count < total_count
else total_count)
with resumable_iteration(
context=self.context,
iterator=posts,
load=load_structure_from_file,
save=save_structure_to_file,
format_path=lambda magic: self.format_filename_within_target_path(
target, owner_profile, self.resume_prefix or '', magic, 'json.xz'
),
check_bbd=self.check_resume_bbd,
enabled=self.resume_prefix is not None
) as resume_info:
is_resuming, start_index = resume_info
for number, post in enumerate(posts):
if max_count is not None and number + start_index >= max_count:
break
if displayed_count is not None:
self.context.log("[{0:{w}d}/{1:{w}d}] ".format(number + start_index + 1, displayed_count,
w=len(str(displayed_count))),
end="", flush=True)
else:
self.context.log("[{:3d}] ".format(number + 1), end="", flush=True)
if post_filter is not None:
try:
if not post_filter(post):
self.context.log("{} skipped".format(post))
continue
except (InstaloaderException, KeyError, TypeError) as err:
self.context.error("{} skipped. Filter evaluation failed: {}".format(post, err))
continue
with self.context.error_catcher("Download {} of {}".format(post, target)):
# The PostChangedException gets raised if the Post's id/shortcode changed while obtaining
# additional metadata. This is most likely the case if a HTTP redirect takes place while
# resolving the shortcode URL.
# The `post_changed` variable keeps the fast-update functionality alive: A Post which is
# obtained after a redirect has probably already been downloaded as a previous Post of the
# same Profile.
# Observed in issue #225: https://github.com/instaloader/instaloader/issues/225
post_changed = False
while True:
self.context.log("[{:3d}] ".format(number + start_index + 1), end="", flush=True)
if post_filter is not None:
try:
downloaded = self.download_post(post, target=target)
break
except PostChangedException:
post_changed = True
if not post_filter(post):
self.context.log("{} skipped".format(post))
continue
except (InstaloaderException, KeyError, TypeError) as err:
self.context.error("{} skipped. Filter evaluation failed: {}".format(post, err))
continue
if fast_update and not downloaded and not post_changed:
break
with self.context.error_catcher("Download {} of {}".format(post, target)):
# The PostChangedException gets raised if the Post's id/shortcode changed while obtaining
# additional metadata. This is most likely the case if a HTTP redirect takes place while
# resolving the shortcode URL.
# The `post_changed` variable keeps the fast-update functionality alive: A Post which is
# obtained after a redirect has probably already been downloaded as a previous Post of the
# same Profile.
# Observed in issue #225: https://github.com/instaloader/instaloader/issues/225
post_changed = False
while True:
try:
downloaded = self.download_post(post, target=target)
break
except PostChangedException:
post_changed = True
continue
if fast_update and not downloaded and not post_changed:
# disengage fast_update for first post when resuming
if not is_resuming or number > 0:
break
@_requires_login
def get_feed_posts(self) -> Iterator[Post]:
@@ -817,8 +851,10 @@ class Instaloader:
"""
self.context.log("Retrieving saved posts...")
assert self.context.username is not None # safe due to @_requires_login; required by typechecker
self.posts_download_loop(Profile.from_username(self.context, self.context.username).get_saved_posts(), ":saved",
fast_update, post_filter, max_count=max_count)
node_iterator = Profile.from_username(self.context, self.context.username).get_saved_posts()
self.posts_download_loop(node_iterator, ":saved",
fast_update, post_filter,
max_count=max_count, total_count=node_iterator.count)
@_requires_login
def get_location_posts(self, location: str) -> Iterator[Post]:
@@ -873,18 +909,20 @@
max_count=max_count)
@_requires_login
def get_explore_posts(self) -> Iterator[Post]:
def get_explore_posts(self) -> NodeIterator[Post]:
"""Get Posts which are worthy of exploring suggested by Instagram.
:return: Iterator over Posts of the user's suggested posts.
:rtype: NodeIterator[Post]
:raises LoginRequiredException: If called without being logged in.
"""
data = self.context.get_json('explore/', {})
yield from (Post(self.context, node)
for node in self.context.graphql_node_list("df0dcc250c2b18d9fd27c5581ef33c7c",
{}, 'https://www.instagram.com/explore/',
lambda d: d['data']['user']['edge_web_discover_media'],
data.get('rhx_gis')))
return NodeIterator(
self.context,
'df0dcc250c2b18d9fd27c5581ef33c7c',
lambda d: d['data']['user']['edge_web_discover_media'],
lambda n: Post(self.context, n),
query_referer='https://www.instagram.com/explore/',
)
def get_hashtag_posts(self, hashtag: str) -> Iterator[Post]:
"""Get Posts associated with a #hashtag.
@@ -955,7 +993,7 @@
.. versionadded:: 4.3"""
self.context.log("Retrieving IGTV videos for profile {}.".format(profile.username))
self.posts_download_loop(profile.get_igtv_posts(), profile.username, fast_update, post_filter,
total_count=profile.igtvcount)
total_count=profile.igtvcount, owner_profile=profile)
def _get_id_filename(self, profile_name: str) -> str:
if ((format_string_contains_key(self.dirname_pattern, 'profile') or
@@ -1110,7 +1148,7 @@
if posts:
self.context.log("Retrieving posts from profile {}.".format(profile_name))
self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
total_count=profile.mediacount)
total_count=profile.mediacount, owner_profile=profile)
if stories and profiles:
with self.context.error_catcher("Download stories"):
@@ -1190,7 +1228,7 @@
# Iterate over pictures and download them
self.context.log("Retrieving posts from profile {}.".format(profile_name))
self.posts_download_loop(profile.get_posts(), profile_name, fast_update, post_filter,
total_count=profile.mediacount)
total_count=profile.mediacount, owner_profile=profile)
def interactive_login(self, username: str) -> None:
"""Logs in and internally stores session, asking user for password interactively.


@@ -428,7 +428,12 @@ class InstaloaderContext:
edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]],
rhx_gis: Optional[str] = None,
first_data: Optional[Dict[str, Any]] = None) -> Iterator[Dict[str, Any]]:
"""Retrieve a list of GraphQL nodes."""
"""
Retrieve a list of GraphQL nodes.
.. deprecated:: 4.5
Use :class:`NodeIterator` instead, which provides more functionality.
"""
def _query():
query_variables['first'] = self._graphql_page_length

instaloader/nodeiterator.py (new file, 261 lines)

@@ -0,0 +1,261 @@
import base64
import hashlib
import json
import os
from contextlib import contextmanager
from datetime import datetime, timedelta
from lzma import LZMAError
from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar
from .exceptions import InvalidArgumentException, QueryReturnedBadRequestException
from .instaloadercontext import InstaloaderContext
FrozenNodeIterator = NamedTuple('FrozenNodeIterator',
[('query_hash', str),
('query_variables', Dict),
('query_referer', Optional[str]),
('context_username', Optional[str]),
('total_index', int),
('best_before', Optional[float]),
('remaining_data', Optional[Dict])])
FrozenNodeIterator.__doc__ = \
"""A serializable representation of a :class:`NodeIterator` instance, saving its iteration state."""
FrozenNodeIterator.query_hash.__doc__ = """The GraphQL ``query_hash`` parameter."""
FrozenNodeIterator.query_variables.__doc__ = """The GraphQL ``query_variables`` parameter."""
FrozenNodeIterator.query_referer.__doc__ = """The HTTP referer used for the GraphQL query."""
FrozenNodeIterator.context_username.__doc__ = """The username who created the iterator, or ``None``."""
FrozenNodeIterator.total_index.__doc__ = """Number of items that have already been returned."""
FrozenNodeIterator.best_before.__doc__ = """Date when parts of the stored nodes might have expired."""
FrozenNodeIterator.remaining_data.__doc__ = \
"""The already-retrieved, yet-unprocessed ``edges`` and the ``page_info`` at time of freezing."""
T = TypeVar('T')
class NodeIterator(Iterator[T]):
"""
Iterate the nodes within edges in a GraphQL pagination. Instances of this class are returned by many (but not all)
of Instaloader's :class:`Post`-returning functions (such as :meth:`Profile.get_posts` etc.).
What makes this iterator special is its ability to freeze/store its current state, e.g. to interrupt an iteration,
and later thaw/resume from where it left off.
You can freeze a NodeIterator with :meth:`NodeIterator.freeze`::
post_iterator = profile.get_posts()
try:
for post in post_iterator:
do_something_with(post)
except KeyboardInterrupt:
save("resume_information.json", post_iterator.freeze())
and later reuse it with :meth:`NodeIterator.thaw` on an equally-constructed NodeIterator::
post_iterator = profile.get_posts()
post_iterator.thaw(load("resume_information.json"))
A :class:`FrozenNodeIterator` can only be thawed with a matching NodeIterator, i.e. a NodeIterator instance that has
been constructed with the same parameters as the instance that is represented by the :class:`FrozenNodeIterator` in
question. This is to ensure that an iteration cannot be resumed in a wrong, unmatching loop. As a quick way to
distinguish iterators that are saved e.g. in files, there is the :attr:`NodeIterator.magic` string: Two
NodeIterators are matching if they have the same magic.
See also :func:`resumable_iteration` for a high-level context manager that handles a resumable iteration.
"""
_graphql_page_length = 50
shelf_life = timedelta(days=29)
def __init__(self,
context: InstaloaderContext,
query_hash: str,
edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]],
node_wrapper: Callable[[Dict], T],
query_variables: Optional[Dict[str, Any]] = None,
query_referer: Optional[str] = None,
first_data: Optional[Dict[str, Any]] = None):
self._context = context
self._query_hash = query_hash
self._edge_extractor = edge_extractor
self._node_wrapper = node_wrapper
self._query_variables = query_variables if query_variables is not None else {}
self._query_referer = query_referer
self._data = first_data
self._page_index = 0
self._total_index = 0
self._best_before = (None if first_data is None else
datetime.now() + NodeIterator.shelf_life)
def _query(self, after: Optional[str] = None) -> Dict:
pagination_variables = {'first': NodeIterator._graphql_page_length} # type: Dict[str, Any]
if after is not None:
pagination_variables['after'] = after
try:
data = self._edge_extractor(
self._context.graphql_query(
self._query_hash, {**self._query_variables, **pagination_variables}, self._query_referer
)
)
self._best_before = datetime.now() + NodeIterator.shelf_life
return data
except QueryReturnedBadRequestException:
new_page_length = int(NodeIterator._graphql_page_length / 2)
if new_page_length >= 12:
NodeIterator._graphql_page_length = new_page_length
self._context.error("HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.",
repeat_at_end=False)
return self._query(after)
else:
raise
def __iter__(self):
return self
def __next__(self):
if self._data is None:
self._data = self._query()
if self._page_index < len(self._data['edges']):
node = self._data['edges'][self._page_index]['node']
page_index, total_index = self._page_index, self._total_index
try:
self._page_index += 1
self._total_index += 1
except KeyboardInterrupt:
self._page_index, self._total_index = page_index, total_index
raise
return self._node_wrapper(node)
if self._data['page_info']['has_next_page']:
query_response = self._query(self._data['page_info']['end_cursor'])
page_index, data = self._page_index, self._data
try:
self._page_index = 0
self._data = query_response
except KeyboardInterrupt:
self._page_index, self._data = page_index, data
raise
return self.__next__()
raise StopIteration()
@property
def count(self) -> Optional[int]:
"""The ``count`` as returned by Instagram. This is not always the total count this iterator will yield."""
return self._data.get('count') if self._data is not None else None
@property
def total_index(self) -> int:
"""Number of items that have already been returned."""
return self._total_index
@property
def magic(self) -> str:
"""Magic string for easily identifying a matching iterator file for resuming (hash of some parameters)."""
if 'blake2b' not in hashlib.algorithms_available:
magic_hash = hashlib.new('sha224')
else:
# Use blake2b when possible, i.e. on Python >= 3.6.
magic_hash = hashlib.blake2b(digest_size=6) # type:ignore # pylint: disable=no-member
magic_hash.update(json.dumps(
[self._query_hash, self._query_variables, self._query_referer, self._context.username]
).encode())
return base64.urlsafe_b64encode(magic_hash.digest()).decode()
def freeze(self) -> FrozenNodeIterator:
"""Freeze the iterator for later resuming."""
remaining_data = None
if self._data is not None:
remaining_data = {**self._data,
'edges': (self._data['edges'][(max(self._page_index - 1, 0)):])}
return FrozenNodeIterator(
query_hash=self._query_hash,
query_variables=self._query_variables,
query_referer=self._query_referer,
context_username=self._context.username,
total_index=max(self.total_index - 1, 0),
best_before=self._best_before.timestamp() if self._best_before else None,
remaining_data=remaining_data,
)
def thaw(self, frozen: FrozenNodeIterator) -> None:
"""Use this iterator for resuming from earlier iteration."""
if self._total_index or self._page_index:
raise InvalidArgumentException("thaw() called on already-used iterator.")
if (self._query_hash != frozen.query_hash or
self._query_variables != frozen.query_variables or
self._query_referer != frozen.query_referer or
self._context.username != frozen.context_username):
raise InvalidArgumentException("Mismatching resume information.")
self._total_index = frozen.total_index
self._best_before = datetime.fromtimestamp(frozen.best_before) if frozen.best_before else None
self._data = frozen.remaining_data
@contextmanager
def resumable_iteration(context: InstaloaderContext,
iterator: Iterator,
load: Callable[[InstaloaderContext, str], Any],
save: Callable[[FrozenNodeIterator, str], None],
format_path: Callable[[str], str],
check_bbd: bool = True,
enabled: bool = True) -> Iterator[Tuple[bool, int]]:
"""
High-level context manager to handle a resumable iteration that can be interrupted with a KeyboardInterrupt.
It can be used as follows to automatically load a previously-saved state into the iterator, save the iterator's
state when interrupted, and delete the resume file upon completion::
post_iterator = profile.get_posts()
with resumable_iteration(
context=L.context,
iterator=post_iterator,
load=lambda _, path: FrozenNodeIterator(**json.load(open(path))),
save=lambda fni, path: json.dump(fni._asdict(), open(path, 'w')),
format_path=lambda magic: "resume_info_{}.json".format(magic)
) as resume_info:
is_resuming, start_index = resume_info
for post in post_iterator:
do_something_with(post)
It yields a tuple (is_resuming, start_index).
When the passed iterator is not a :class:`NodeIterator`, it behaves as if ``resumable_iteration`` was not used,
just executing the inner body.
:param context: The :class:`InstaloaderContext`.
:param iterator: The fresh :class:`NodeIterator`.
:param load: Loads a FrozenNodeIterator from given path. The object is ignored if it has a different type.
:param save: Saves the given FrozenNodeIterator to the given path.
:param format_path: Returns the path to the resume file for the given magic.
:param check_bbd: Whether to check the best before date and reject an expired FrozenNodeIterator.
:param enabled: Set to False to disable all functionality and simply execute the inner body.
"""
if not enabled or not isinstance(iterator, NodeIterator):
yield False, 0
return
is_resuming = False
start_index = 0
resume_file_path = format_path(iterator.magic)
resume_file_exists = os.path.isfile(resume_file_path)
if resume_file_exists:
try:
fni = load(context, resume_file_path)
if not isinstance(fni, FrozenNodeIterator):
raise InvalidArgumentException("Invalid type.")
if check_bbd and fni.best_before and datetime.fromtimestamp(fni.best_before) < datetime.now():
raise InvalidArgumentException("\"Best before\" date exceeded.")
iterator.thaw(fni)
is_resuming = True
start_index = iterator.total_index
context.log("Resuming from {}.".format(resume_file_path))
except (InvalidArgumentException, LZMAError, json.decoder.JSONDecodeError) as exc:
context.error("Warning: Not resuming from {}: {}".format(resume_file_path, exc))
try:
yield is_resuming, start_index
except KeyboardInterrupt:
os.makedirs(os.path.dirname(resume_file_path), exist_ok=True)
save(iterator.freeze(), resume_file_path)
context.log("\nSaved resume information to {}.".format(resume_file_path))
raise
if resume_file_exists:
os.unlink(resume_file_path)
context.log("Iteration complete, deleted resume information file {}.".format(resume_file_path))


@@ -9,7 +9,7 @@ from typing import Any, Dict, Iterator, List, Optional, Union
from . import __version__
from .exceptions import *
from .instaloadercontext import InstaloaderContext
from .nodeiterator import FrozenNodeIterator, NodeIterator
PostSidecarNode = namedtuple('PostSidecarNode', ['is_video', 'display_url', 'video_url'])
PostSidecarNode.__doc__ = "Item of a Sidecar Post."
@@ -402,11 +402,14 @@ class Post:
# If the answer's metadata already contains all comments, don't do GraphQL requests to obtain them
yield from (_postcommentanswer(comment['node']) for comment in answer_edges)
return
yield from (_postcommentanswer(answer_node) for answer_node in
self._context.graphql_node_list("51fdd02b67508306ad4484ff574a0b62",
{'comment_id': node['id']},
'https://www.instagram.com/p/' + self.shortcode + '/',
lambda d: d['data']['comment']['edge_threaded_comments']))
yield from NodeIterator(
self._context,
'51fdd02b67508306ad4484ff574a0b62',
lambda d: d['data']['comment']['edge_threaded_comments'],
_postcommentanswer,
{'comment_id': node['id']},
'https://www.instagram.com/p/{0}/'.format(self.shortcode),
)
def _postcomment(node):
return PostComment(*_postcommentanswer(node),
@@ -422,12 +425,14 @@ class Post:
# If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them
yield from (_postcomment(comment['node']) for comment in comment_edges)
return
yield from (_postcomment(node) for node in
self._context.graphql_node_list(
"97b41c52301f77ce508f55e66d17620e",
{'shortcode': self.shortcode},
'https://www.instagram.com/p/' + self.shortcode + '/',
lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment']))
yield from NodeIterator(
self._context,
'97b41c52301f77ce508f55e66d17620e',
lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'],
_postcomment,
{'shortcode': self.shortcode},
'https://www.instagram.com/p/{0}/'.format(self.shortcode),
)
def get_likes(self) -> Iterator['Profile']:
"""Iterate over all likes of the post. A :class:`Profile` instance of each likee is yielded."""
@@ -439,10 +444,14 @@ class Post:
# If the Post's metadata already contains all likes, don't do GraphQL requests to obtain them
yield from (Profile(self._context, like['node']) for like in likes_edges)
return
yield from (Profile(self._context, node) for node in
self._context.graphql_node_list("1cb6ec562846122743b61e492c85999f", {'shortcode': self.shortcode},
'https://www.instagram.com/p/' + self.shortcode + '/',
lambda d: d['data']['shortcode_media']['edge_liked_by']))
yield from NodeIterator(
self._context,
'1cb6ec562846122743b61e492c85999f',
lambda d: d['data']['shortcode_media']['edge_liked_by'],
lambda n: Profile(self._context, n),
{'shortcode': self.shortcode},
'https://www.instagram.com/p/{0}/'.format(self.shortcode),
)
@property
def is_sponsored(self) -> bool:
@@ -770,80 +779,110 @@ class Profile:
Use :attr:`profile_pic_url`."""
return self.profile_pic_url
def get_posts(self) -> Iterator[Post]:
"""Retrieve all posts from a profile."""
self._obtain_metadata()
yield from (Post(self._context, node, self) for node in
self._context.graphql_node_list("472f257a40c653c64c666ce877d59d2b",
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
lambda d: d['data']['user']['edge_owner_to_timeline_media'],
first_data=self._metadata('edge_owner_to_timeline_media')))
def get_posts(self) -> NodeIterator[Post]:
"""Retrieve all posts from a profile.
def get_saved_posts(self) -> Iterator[Post]:
"""Get Posts that are marked as saved by the user."""
:rtype: NodeIterator[Post]"""
self._obtain_metadata()
return NodeIterator(
self._context,
'472f257a40c653c64c666ce877d59d2b',
lambda d: d['data']['user']['edge_owner_to_timeline_media'],
lambda n: Post(self._context, n, self),
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
self._metadata('edge_owner_to_timeline_media'),
)
def get_saved_posts(self) -> NodeIterator[Post]:
"""Get Posts that are marked as saved by the user.
:rtype: NodeIterator[Post]"""
if self.username != self._context.username:
raise LoginRequiredException("--login={} required to get that profile's saved posts.".format(self.username))
self._obtain_metadata()
yield from (Post(self._context, node) for node in
self._context.graphql_node_list("f883d95537fbcd400f466f63d42bd8a1",
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
lambda d: d['data']['user']['edge_saved_media'],
first_data=self._metadata('edge_saved_media')))
return NodeIterator(
self._context,
'f883d95537fbcd400f466f63d42bd8a1',
lambda d: d['data']['user']['edge_saved_media'],
lambda n: Post(self._context, n),
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
self._metadata('edge_saved_media'),
)
def get_tagged_posts(self) -> Iterator[Post]:
def get_tagged_posts(self) -> NodeIterator[Post]:
"""Retrieve all posts where a profile is tagged.
:rtype: NodeIterator[Post]
.. versionadded:: 4.0.7"""
self._obtain_metadata()
yield from (Post(self._context, node, self if int(node['owner']['id']) == self.userid else None) for node in
self._context.graphql_node_list("e31a871f7301132ceaab56507a66bbb7",
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
lambda d: d['data']['user']['edge_user_to_photos_of_you']))
return NodeIterator(
self._context,
'e31a871f7301132ceaab56507a66bbb7',
lambda d: d['data']['user']['edge_user_to_photos_of_you'],
lambda n: Post(self._context, n, self if int(n['owner']['id']) == self.userid else None),
{'id': self.userid},
'https://www.instagram.com/{0}/'.format(self.username),
)
def get_igtv_posts(self) -> Iterator[Post]:
def get_igtv_posts(self) -> NodeIterator[Post]:
"""Retrieve all IGTV posts.
:rtype: NodeIterator[Post]
.. versionadded:: 4.3"""
self._obtain_metadata()
yield from (Post(self._context, node, self) for node in
self._context.graphql_node_list('bc78b344a68ed16dd5d7f264681c4c76',
{'id': self.userid},
'https://www.instagram.com/{0}/channel/'.format(self.username),
lambda d: d['data']['user']['edge_felix_video_timeline'],
first_data=self._metadata('edge_felix_video_timeline')))
return NodeIterator(
self._context,
'bc78b344a68ed16dd5d7f264681c4c76',
lambda d: d['data']['user']['edge_felix_video_timeline'],
lambda n: Post(self._context, n, self),
{'id': self.userid},
'https://www.instagram.com/{0}/channel/'.format(self.username),
self._metadata('edge_felix_video_timeline'),
)
def get_followers(self) -> Iterator['Profile']:
def get_followers(self) -> NodeIterator['Profile']:
"""
Retrieve list of followers of given profile.
To use this, one needs to be logged in and private profiles have to be followed.
:rtype: NodeIterator[Profile]
"""
if not self._context.is_logged_in:
raise LoginRequiredException("--login required to get a profile's followers.")
self._obtain_metadata()
yield from (Profile(self._context, node) for node in
self._context.graphql_node_list("37479f2b8209594dde7facb0d904896a",
{'id': str(self.userid)},
'https://www.instagram.com/' + self.username + '/',
lambda d: d['data']['user']['edge_followed_by']))
return NodeIterator(
self._context,
'37479f2b8209594dde7facb0d904896a',
lambda d: d['data']['user']['edge_followed_by'],
lambda n: Profile(self._context, n),
{'id': str(self.userid)},
'https://www.instagram.com/{0}/'.format(self.username),
)
def get_followees(self) -> Iterator['Profile']:
def get_followees(self) -> NodeIterator['Profile']:
"""
Retrieve list of followees (followings) of given profile.
To use this, one needs to be logged in and private profiles have to be followed.
:rtype: NodeIterator[Profile]
"""
if not self._context.is_logged_in:
raise LoginRequiredException("--login required to get a profile's followees.")
self._obtain_metadata()
yield from (Profile(self._context, node) for node in
self._context.graphql_node_list("58712303d941c6855d4e888c5f0cd22f",
{'id': str(self.userid)},
'https://www.instagram.com/' + self.username + '/',
lambda d: d['data']['user']['edge_follow']))
return NodeIterator(
self._context,
'58712303d941c6855d4e888c5f0cd22f',
lambda d: d['data']['user']['edge_follow'],
lambda n: Profile(self._context, n),
{'id': str(self.userid)},
'https://www.instagram.com/{0}/'.format(self.username),
)
def get_similar_accounts(self) -> Iterator['Profile']:
"""
@@ -1398,7 +1437,7 @@
return self._searchstring
JsonExportable = Union[Post, Profile, StoryItem, Hashtag]
JsonExportable = Union[Post, Profile, StoryItem, Hashtag, FrozenNodeIterator]
def save_structure_to_file(structure: JsonExportable, filename: str) -> None:
@@ -1447,6 +1486,8 @@ def load_structure_from_file(context: InstaloaderContext, filename: str) -> Json
return StoryItem(context, json_structure['node'])
elif node_type == "Hashtag":
return Hashtag(context, json_structure['node'])
elif node_type == "FrozenNodeIterator":
return FrozenNodeIterator(**json_structure['node'])
else:
raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))
elif 'shortcode' in json_structure:
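
A minimal freeze/thaw round-trip through these serializers, as a hedged illustration ("example_user" and "state.json.xz" are placeholders):

    from instaloader import (Instaloader, Profile,
                             load_structure_from_file, save_structure_to_file)

    L = Instaloader()
    it = Profile.from_username(L.context, "example_user").get_posts()
    next(it)                                    # advance so there is state to keep
    save_structure_to_file(it.freeze(), "state.json.xz")

    fresh = Profile.from_username(L.context, "example_user").get_posts()
    fresh.thaw(load_structure_from_file(L.context, "state.json.xz"))
    # `fresh` now continues where `it` stopped, re-yielding the interrupted node.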