From f6731566cd23e368c10acd8cda6974d7933d524b Mon Sep 17 00:00:00 2001 From: Alexander Graf <17130992+aandergr@users.noreply.github.com> Date: Sat, 13 Mar 2021 11:17:56 +0100 Subject: [PATCH] Support resuming of downloading comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: André Koch-Kramer --- instaloader/instaloader.py | 39 ++++++++++++++++++++++++++++++++----- instaloader/nodeiterator.py | 4 ++-- instaloader/structures.py | 14 +++++++------ 3 files changed, 44 insertions(+), 13 deletions(-) diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index c71acb2..e665100 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -335,18 +335,47 @@ class Instaloader: combined_answers.extend(y['answers']) unique_comments_list[-1]['answers'] = get_unique_comments(combined_answers) return unique_comments_list + + def get_new_comments(new_comments, start): + for idx, comment in enumerate(new_comments, start=start+1): + if idx % 250 == 0: + self.context.log('{}'.format(idx), end='…', flush=True) + yield comment + + def save_comments(extended_comments): + unique_comments = get_unique_comments(extended_comments, combine_answers=True) + answer_ids = set(int(answer['id']) for comment in unique_comments for answer in comment.get('answers', [])) + with open(filename, 'w') as file: + file.write(json.dumps(list(filter(lambda t: int(t['id']) not in answer_ids, unique_comments)), + indent=4)) + + base_filename = filename filename += '_comments.json' try: with open(filename) as fp: comments = json.load(fp) except (FileNotFoundError, json.decoder.JSONDecodeError): comments = list() - comments.extend(_postcomment_asdict(comment) for comment in post.get_comments()) + + comments_iterator = post.get_comments() + try: + with resumable_iteration( + context=self.context, + iterator=comments_iterator, + load=load_structure_from_file, + save=save_structure_to_file, + format_path=lambda magic: "{}_{}_{}.json.xz".format(base_filename, self.resume_prefix, magic), + check_bbd=self.check_resume_bbd, + enabled=self.resume_prefix is not None + ) as (_is_resuming, start_index): + comments.extend(_postcomment_asdict(comment) + for comment in get_new_comments(comments_iterator, start_index)) + except (KeyboardInterrupt, AbortDownloadException): + if comments: + save_comments(comments) + raise if comments: - comments = get_unique_comments(comments, combine_answers=True) - answer_ids = set(int(answer['id']) for comment in comments for answer in comment.get('answers', [])) - with open(filename, 'w') as file: - file.write(json.dumps(list(filter(lambda t: int(t['id']) not in answer_ids, comments)), indent=4)) + save_comments(comments) self.context.log('comments', end=' ', flush=True) def save_caption(self, filename: str, mtime: datetime, caption: str) -> None: diff --git a/instaloader/nodeiterator.py b/instaloader/nodeiterator.py index 5ddf0b8..d8d74c3 100644 --- a/instaloader/nodeiterator.py +++ b/instaloader/nodeiterator.py @@ -5,7 +5,7 @@ import os from contextlib import contextmanager from datetime import datetime, timedelta from lzma import LZMAError -from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar +from typing import Any, Callable, Dict, Iterable, Iterator, NamedTuple, Optional, Tuple, TypeVar from .exceptions import AbortDownloadException, InvalidArgumentException, QueryReturnedBadRequestException from .instaloadercontext import InstaloaderContext @@ -204,7 +204,7 @@ class NodeIterator(Iterator[T]): @contextmanager def resumable_iteration(context: InstaloaderContext, - iterator: Iterator, + iterator: Iterable, load: Callable[[InstaloaderContext, str], Any], save: Callable[[FrozenNodeIterator, str], None], format_path: Callable[[str], str], diff --git a/instaloader/structures.py b/instaloader/structures.py index bfb2c77..6cca9e4 100644 --- a/instaloader/structures.py +++ b/instaloader/structures.py @@ -4,7 +4,7 @@ import re from base64 import b64decode, b64encode from collections import namedtuple from datetime import datetime -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterable, Iterator, List, Optional, Union from . import __version__ from .exceptions import * @@ -426,12 +426,15 @@ class Post: except KeyError: return self._field('edge_media_to_comment', 'count') - def get_comments(self) -> Iterator[PostComment]: + def get_comments(self) -> Iterable[PostComment]: r"""Iterate over all comments of the post. Each comment is represented by a PostComment namedtuple with fields text (string), created_at (datetime), id (int), owner (:class:`Profile`) and answers (:class:`~typing.Iterator`\ [:class:`PostCommentAnswer`]) if available. + + .. versionchanged:: 4.7 + Change return type to ``Iterable``. """ def _postcommentanswer(node): return PostCommentAnswer(id=int(node['id']), @@ -466,16 +469,15 @@ class Post: answers=_postcommentanswers(node)) if self.comments == 0: # Avoid doing additional requests if there are no comments - return + return [] comment_edges = self._field('edge_media_to_comment', 'edges') answers_count = sum([edge['node'].get('edge_threaded_comments', {}).get('count', 0) for edge in comment_edges]) if self.comments == len(comment_edges) + answers_count: # If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them - yield from (_postcomment(comment['node']) for comment in comment_edges) - return - yield from NodeIterator( + return [_postcomment(comment['node']) for comment in comment_edges] + return NodeIterator( self._context, '97b41c52301f77ce508f55e66d17620e', lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'],