Support resuming the download of comments

Co-Authored-By: André Koch-Kramer <koch-kramer@web.de>
2024-08-18 20:59:38 +02:00 · 2021-03-13 11:17:56 +01:00 · 2021-03-13 11:17:56 +01:00 · f6731566cd
commit f6731566cd
parent cd13211603
3 changed files with 44 additions and 13 deletions
--- a/instaloader/instaloader.py
+++ b/instaloader/instaloader.py
@ -335,18 +335,47 @@ class Instaloader:
                            combined_answers.extend(y['answers'])
                        unique_comments_list[-1]['answers'] = get_unique_comments(combined_answers)
            return unique_comments_list
+
+        def get_new_comments(new_comments, start):
+            for idx, comment in enumerate(new_comments, start=start+1):
+                if idx % 250 == 0:
+                    self.context.log('{}'.format(idx), end='…', flush=True)
+                yield comment
+
+        def save_comments(extended_comments):
+            unique_comments = get_unique_comments(extended_comments, combine_answers=True)
+            answer_ids = set(int(answer['id']) for comment in unique_comments for answer in comment.get('answers', []))
+            with open(filename, 'w') as file:
+                file.write(json.dumps(list(filter(lambda t: int(t['id']) not in answer_ids, unique_comments)),
+                                      indent=4))
+
+        base_filename = filename
        filename += '_comments.json'
        try:
            with open(filename) as fp:
                comments = json.load(fp)
        except (FileNotFoundError, json.decoder.JSONDecodeError):
            comments = list()
-        comments.extend(_postcomment_asdict(comment) for comment in post.get_comments())
+
+        comments_iterator = post.get_comments()
+        try:
+            with resumable_iteration(
+                    context=self.context,
+                    iterator=comments_iterator,
+                    load=load_structure_from_file,
+                    save=save_structure_to_file,
+                    format_path=lambda magic: "{}_{}_{}.json.xz".format(base_filename, self.resume_prefix, magic),
+                    check_bbd=self.check_resume_bbd,
+                    enabled=self.resume_prefix is not None
+            ) as (_is_resuming, start_index):
+                comments.extend(_postcomment_asdict(comment)
+                                for comment in get_new_comments(comments_iterator, start_index))
+        except (KeyboardInterrupt, AbortDownloadException):
+            if comments:
+                save_comments(comments)
+            raise
        if comments:
-            comments = get_unique_comments(comments, combine_answers=True)
-            answer_ids = set(int(answer['id']) for comment in comments for answer in comment.get('answers', []))
-            with open(filename, 'w') as file:
-                file.write(json.dumps(list(filter(lambda t: int(t['id']) not in answer_ids, comments)), indent=4))
+            save_comments(comments)
            self.context.log('comments', end=' ', flush=True)

    def save_caption(self, filename: str, mtime: datetime, caption: str) -> None:
--- a/instaloader/nodeiterator.py
+++ b/instaloader/nodeiterator.py
@ -5,7 +5,7 @@ import os
 from contextlib import contextmanager
 from datetime import datetime, timedelta
 from lzma import LZMAError
-from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar
+from typing import Any, Callable, Dict, Iterable, Iterator, NamedTuple, Optional, Tuple, TypeVar

 from .exceptions import AbortDownloadException, InvalidArgumentException, QueryReturnedBadRequestException
 from .instaloadercontext import InstaloaderContext
@ -204,7 +204,7 @@ class NodeIterator(Iterator[T]):

@contextmanager
 def resumable_iteration(context: InstaloaderContext,
-                        iterator: Iterator,
+                        iterator: Iterable,
                        load: Callable[[InstaloaderContext, str], Any],
                        save: Callable[[FrozenNodeIterator, str], None],
                        format_path: Callable[[str], str],
--- a/instaloader/structures.py
+++ b/instaloader/structures.py
@ -4,7 +4,7 @@ import re
 from base64 import b64decode, b64encode
 from collections import namedtuple
 from datetime import datetime
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterable, Iterator, List, Optional, Union

 from . import __version__
 from .exceptions import *
@ -426,12 +426,15 @@ class Post:
        except KeyError:
            return self._field('edge_media_to_comment', 'count')

-    def get_comments(self) -> Iterator[PostComment]:
+    def get_comments(self) -> Iterable[PostComment]:
        r"""Iterate over all comments of the post.

        Each comment is represented by a PostComment namedtuple with fields text (string), created_at (datetime),
        id (int), owner (:class:`Profile`) and answers (:class:`~typing.Iterator`\ [:class:`PostCommentAnswer`])
        if available.
+
+        .. versionchanged:: 4.7
+           Change return type to ``Iterable``.
        """
        def _postcommentanswer(node):
            return PostCommentAnswer(id=int(node['id']),
@ -466,16 +469,15 @@ class Post:
                               answers=_postcommentanswers(node))
        if self.comments == 0:
            # Avoid doing additional requests if there are no comments
-            return
+            return []

        comment_edges = self._field('edge_media_to_comment', 'edges')
        answers_count = sum([edge['node'].get('edge_threaded_comments', {}).get('count', 0) for edge in comment_edges])

        if self.comments == len(comment_edges) + answers_count:
            # If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them
-            yield from (_postcomment(comment['node']) for comment in comment_edges)
-            return
-        yield from NodeIterator(
+            return [_postcomment(comment['node']) for comment in comment_edges]
+        return NodeIterator(
            self._context,
            '97b41c52301f77ce508f55e66d17620e',
            lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'],