1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-11-04 17:32:30 +01:00

Support resuming of downloading comments

Co-Authored-By: André Koch-Kramer <koch-kramer@web.de>
This commit is contained in:
Alexander Graf 2021-03-13 11:17:56 +01:00
parent cd13211603
commit f6731566cd
3 changed files with 44 additions and 13 deletions

View File

@ -335,18 +335,47 @@ class Instaloader:
combined_answers.extend(y['answers']) combined_answers.extend(y['answers'])
unique_comments_list[-1]['answers'] = get_unique_comments(combined_answers) unique_comments_list[-1]['answers'] = get_unique_comments(combined_answers)
return unique_comments_list return unique_comments_list
def get_new_comments(new_comments, start):
    """Yield every item of ``new_comments`` unchanged, logging progress on the way.

    Items are numbered consecutively beginning at ``start + 1``. Whenever the
    running number is a multiple of 250, that number is written through
    ``self.context.log`` before the item is yielded.
    """
    numbered = enumerate(new_comments, start=start + 1)
    for ordinal, item in numbered:
        reached_milestone = ordinal % 250 == 0
        if reached_milestone:
            # Progress marker only; the yielded item itself is not altered.
            self.context.log('{}'.format(ordinal), end='', flush=True)
        yield item
def save_comments(extended_comments):
    """Deduplicate ``extended_comments`` and write the result to ``filename`` as JSON.

    The comments are first passed through ``get_unique_comments`` with
    ``combine_answers=True``. Every entry whose id also occurs as the id of
    an answer of some comment is left out of the written list. The output is
    serialized with an indentation of 4.
    """
    deduplicated = get_unique_comments(extended_comments, combine_answers=True)
    # Ids of all answers, so that entries duplicating an answer can be dropped below.
    ids_of_answers = {
        int(reply['id'])
        for entry in deduplicated
        for reply in entry.get('answers', [])
    }
    with open(filename, 'w') as file:
        # Filtering and serialization happen after the file has been opened,
        # in the same order as the single nested expression this replaces.
        top_level = [entry for entry in deduplicated if int(entry['id']) not in ids_of_answers]
        serialized = json.dumps(top_level, indent=4)
        file.write(serialized)
base_filename = filename
filename += '_comments.json' filename += '_comments.json'
try: try:
with open(filename) as fp: with open(filename) as fp:
comments = json.load(fp) comments = json.load(fp)
except (FileNotFoundError, json.decoder.JSONDecodeError): except (FileNotFoundError, json.decoder.JSONDecodeError):
comments = list() comments = list()
comments.extend(_postcomment_asdict(comment) for comment in post.get_comments())
comments_iterator = post.get_comments()
try:
with resumable_iteration(
context=self.context,
iterator=comments_iterator,
load=load_structure_from_file,
save=save_structure_to_file,
format_path=lambda magic: "{}_{}_{}.json.xz".format(base_filename, self.resume_prefix, magic),
check_bbd=self.check_resume_bbd,
enabled=self.resume_prefix is not None
) as (_is_resuming, start_index):
comments.extend(_postcomment_asdict(comment)
for comment in get_new_comments(comments_iterator, start_index))
except (KeyboardInterrupt, AbortDownloadException):
if comments: if comments:
comments = get_unique_comments(comments, combine_answers=True) save_comments(comments)
answer_ids = set(int(answer['id']) for comment in comments for answer in comment.get('answers', [])) raise
with open(filename, 'w') as file: if comments:
file.write(json.dumps(list(filter(lambda t: int(t['id']) not in answer_ids, comments)), indent=4)) save_comments(comments)
self.context.log('comments', end=' ', flush=True) self.context.log('comments', end=' ', flush=True)
def save_caption(self, filename: str, mtime: datetime, caption: str) -> None: def save_caption(self, filename: str, mtime: datetime, caption: str) -> None:

View File

@ -5,7 +5,7 @@ import os
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime, timedelta from datetime import datetime, timedelta
from lzma import LZMAError from lzma import LZMAError
from typing import Any, Callable, Dict, Iterator, NamedTuple, Optional, Tuple, TypeVar from typing import Any, Callable, Dict, Iterable, Iterator, NamedTuple, Optional, Tuple, TypeVar
from .exceptions import AbortDownloadException, InvalidArgumentException, QueryReturnedBadRequestException from .exceptions import AbortDownloadException, InvalidArgumentException, QueryReturnedBadRequestException
from .instaloadercontext import InstaloaderContext from .instaloadercontext import InstaloaderContext
@ -204,7 +204,7 @@ class NodeIterator(Iterator[T]):
@contextmanager @contextmanager
def resumable_iteration(context: InstaloaderContext, def resumable_iteration(context: InstaloaderContext,
iterator: Iterator, iterator: Iterable,
load: Callable[[InstaloaderContext, str], Any], load: Callable[[InstaloaderContext, str], Any],
save: Callable[[FrozenNodeIterator, str], None], save: Callable[[FrozenNodeIterator, str], None],
format_path: Callable[[str], str], format_path: Callable[[str], str],

View File

@ -4,7 +4,7 @@ import re
from base64 import b64decode, b64encode from base64 import b64decode, b64encode
from collections import namedtuple from collections import namedtuple
from datetime import datetime from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Union from typing import Any, Dict, Iterable, Iterator, List, Optional, Union
from . import __version__ from . import __version__
from .exceptions import * from .exceptions import *
@ -426,12 +426,15 @@ class Post:
except KeyError: except KeyError:
return self._field('edge_media_to_comment', 'count') return self._field('edge_media_to_comment', 'count')
def get_comments(self) -> Iterator[PostComment]: def get_comments(self) -> Iterable[PostComment]:
r"""Iterate over all comments of the post. r"""Iterate over all comments of the post.
Each comment is represented by a PostComment namedtuple with fields text (string), created_at (datetime), Each comment is represented by a PostComment namedtuple with fields text (string), created_at (datetime),
id (int), owner (:class:`Profile`) and answers (:class:`~typing.Iterator`\ [:class:`PostCommentAnswer`]) id (int), owner (:class:`Profile`) and answers (:class:`~typing.Iterator`\ [:class:`PostCommentAnswer`])
if available. if available.
.. versionchanged:: 4.7
Change return type to ``Iterable``.
""" """
def _postcommentanswer(node): def _postcommentanswer(node):
return PostCommentAnswer(id=int(node['id']), return PostCommentAnswer(id=int(node['id']),
@ -466,16 +469,15 @@ class Post:
answers=_postcommentanswers(node)) answers=_postcommentanswers(node))
if self.comments == 0: if self.comments == 0:
# Avoid doing additional requests if there are no comments # Avoid doing additional requests if there are no comments
return return []
comment_edges = self._field('edge_media_to_comment', 'edges') comment_edges = self._field('edge_media_to_comment', 'edges')
answers_count = sum([edge['node'].get('edge_threaded_comments', {}).get('count', 0) for edge in comment_edges]) answers_count = sum([edge['node'].get('edge_threaded_comments', {}).get('count', 0) for edge in comment_edges])
if self.comments == len(comment_edges) + answers_count: if self.comments == len(comment_edges) + answers_count:
# If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them # If the Post's metadata already contains all parent comments, don't do GraphQL requests to obtain them
yield from (_postcomment(comment['node']) for comment in comment_edges) return [_postcomment(comment['node']) for comment in comment_edges]
return return NodeIterator(
yield from NodeIterator(
self._context, self._context,
'97b41c52301f77ce508f55e66d17620e', '97b41c52301f77ce508f55e66d17620e',
lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'], lambda d: d['data']['shortcode_media']['edge_media_to_parent_comment'],