1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-08-18 20:59:38 +02:00

Support resuming of downloading comments

Co-Authored-By: André Koch-Kramer <koch-kramer@web.de>
This commit is contained in:
Alexander Graf 2021-03-13 11:17:56 +01:00
parent cd13211603
commit f6731566cd
3 changed files with 44 additions and 13 deletions

View File

@ -335,18 +335,47 @@ class Instaloader:
combined_answers.extend(y['answers'])
unique_comments_list[-1]['answers'] = get_unique_comments(combined_answers)
return unique_comments_list
def get_new_comments(new_comments, start):
    """Yield every item of ``new_comments`` unchanged while reporting progress.

    Items are numbered consecutively beginning at ``start + 1``. Each time the
    running number is a multiple of 250 it is written to the log without a
    trailing newline, so progress accumulates on a single output line.

    ``self`` is taken from the enclosing method's scope.

    :param new_comments: Iterable of comments to pass through.
    :param start: Number of comments already processed before this call.
    """
    numbered = enumerate(new_comments, start=start + 1)
    for position, item in numbered:
        # Progress marker every 250 comments.
        if not position % 250:
            self.context.log('{}'.format(position), end='', flush=True)
        yield item
def save_comments(extended_comments):
    """Deduplicate ``extended_comments`` and write them to ``filename`` as JSON.

    The comments are first reduced with ``get_unique_comments(...,
    combine_answers=True)``. Any entry whose id also occurs as the id of an
    answer nested under some comment is then dropped from the top level, so
    answers are stored only beneath their parent comment.

    ``filename``, ``get_unique_comments`` and ``json`` come from the enclosing
    scope.

    :param extended_comments: List of comment dicts, each with an ``'id'`` key
       and optionally an ``'answers'`` list of dicts that also carry ``'id'``.
    """
    unique_comments = get_unique_comments(extended_comments, combine_answers=True)
    answer_ids = {int(answer['id'])
                  for comment in unique_comments
                  for answer in comment.get('answers', [])}
    top_level_comments = [comment for comment in unique_comments
                          if int(comment['id']) not in answer_ids]
    # Serialize BEFORE opening the file: mode 'w' truncates immediately, so an
    # error or interrupt during filtering/serialization would otherwise leave
    # an empty comments file behind and lose previously saved comments.
    serialized = json.dumps(top_level_comments, indent=4)
    with open(filename, 'w') as file:
        file.write(serialized)
base_filename = filename
filename += '_comments.json'
try:
with open(filename) as fp:
comments = json.load(fp)
except (FileNotFoundError, json.decoder.JSONDecodeError):
comments = list()
comments.extend(_postcomment_asdict(comment) for comment in post.get_comments())
comments_iterator = post.get_comments()
try:
with resumable_iteration(
context=self.context,
iterator=comments_iterator,
load=load_structure_from_file,
save=save_structure_to_file,
format_path=lambda magic: "{}_{}_{}.json.xz".format(base_filename, self.resume_prefix, magic),
check_bbd=self.check_resume_bbd,
enabled=self.resume_prefix is not None
) as (_is_resuming, start_index):
comments.extend(_postcomment_asdict(comment)
for comment in get_new_comments(comments_iterator, start_index))
except (KeyboardInterrupt, AbortDownloadException):
if comments:
save_comments(comments)
raise
if comments:
comments = get_unique_comments(comments, combine_answers=True)
answer_ids = set(int(answer['id']) for comment in comments for answer in comment.get('answers', []))
with open(filename, 'w') as file:
file.write(json.dumps(list(filter(lambda t: int(t['id']) not in answer_ids, comments)), indent=4))
save_comments(comments)
self.context.log('comments', end=' ', flush=True)
def save_caption(self, filename: str, mtime: datetime, caption: str) -> None:

View File

@ -5,7 +5,7 @@ import os
from contextlib import contextmanager
from datetime import datetime, timedelta
from lzma import LZMAError
from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar
from typing import Any, Callable, Dict, Iterable, Iterator, NamedTuple, Optional, Tuple, TypeVar
from .exceptions import AbortDownloadException, InvalidArgumentException, QueryReturnedBadRequestException
from .instaloadercontext import InstaloaderContext
@ -204,7 +204,7 @@ class NodeIterator(Iterator[T]):
@contextmanager
def resumable_iteration(context: InstaloaderContext,
iterator: Iterator,
iterator: Iterable,
load: Callable[[InstaloaderContext, str], Any],
save: Callable[[FrozenNodeIterator, str], None],
format_path: Callable[[str], str],

View File

@ -4,7 +4,7 @@ import re
from base64 import b64decode, b64encode
from collections import namedtuple
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Union
from typing import Any, Dict, Iterable, Iterator, List, Optional, Union
from . import __version__
from .exceptions import *
@ -426,12 +426,15 @@ class Post:
except KeyError:
return self._field('edge_media_to_comment', 'count')
def get_comments(self) -> Iterator[PostComment]:
def get_comments(self) -> Iterable[PostComment]:
r"""Iterate over all comments of the post.
Each comment is represented by a PostComment namedtuple with fields text (string), created_at (datetime),
id (int), owner (:class:`Profile`) and answers (:class:`~typing.Iterator`\ [:class:`PostCommentAnswer`])
if available.
.. versionchanged:: 4.7
Change return type to ``Iterable``.
"""
def _postcommentanswer(node):
return PostCommentAnswer(id=int(node['id']),
@ -466,16 +469,15 @@ class Post:
answers=_postcommentanswers(node))
if self.comments == 0:
# Avoid doing additional requests if there are no comments
return
return []
comment_edges = self._field('edge_media_to_comment', 'edges')
answers_count = sum([edge['node'].get('edge_threaded_comments', {}).get('count', 0) for edge in comment_edges])
if self.comments == len(comment_edges) + answers_count:
# If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them
yield from (_postcomment(comment['node']) for comment in comment_edges)
return
yield from NodeIterator(
return [_postcomment(comment['node']) for comment in comment_edges]
return NodeIterator(
self._context,
'97b41c52301f77ce508f55e66d17620e',
lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'],