1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-09-11 16:22:24 +02:00

Wait smarter to avoid HTTP error code 429

Additional sleeps are necessary because Instagram is rate limiting
GraphQL queries. The error does not occur if not more than 100 queries
are made in a sliding window of eleven minutes.
Ports a894c2d to version 3.
This commit is contained in:
André Koch-Kramer 2017-08-24 18:30:46 +02:00
parent 2e47642f74
commit bb71c40b56

View File

@ -6,6 +6,7 @@ import getpass
import json import json
import os import os
import pickle import pickle
import random
import re import re
import shutil import shutil
import string import string
@ -356,6 +357,7 @@ class Instaloader:
self.download_geotags = download_geotags self.download_geotags = download_geotags
self.download_captions = download_captions self.download_captions = download_captions
self.download_comments = download_comments self.download_comments = download_comments
self.previous_queries = dict()
# error log, filled with error() and printed at the end of Instaloader.main() # error log, filled with error() and printed at the end of Instaloader.main()
self.error_log = [] self.error_log = []
@ -375,12 +377,10 @@ class Instaloader:
self.dirname_pattern, self.filename_pattern, self.dirname_pattern, self.filename_pattern,
self.download_videos, self.download_geotags, self.download_videos, self.download_geotags,
self.download_captions, self.download_comments) self.download_captions, self.download_comments)
new_loader.request_count = self.request_count new_loader.previous_queries = self.previous_queries
new_loader.last_request_time = self.last_request_time
yield new_loader yield new_loader
self.error_log.extend(new_loader.error_log) self.error_log.extend(new_loader.error_log)
self.request_count = new_loader.request_count self.previous_queries = new_loader.previous_queries
self.last_request_time = new_loader.last_request_time
def _log(self, *msg, sep='', end='\n', flush=False): def _log(self, *msg, sep='', end='\n', flush=False):
"""Log a message to stdout that can be suppressed with --quiet.""" """Log a message to stdout that can be suppressed with --quiet."""
@ -408,17 +408,8 @@ class Instaloader:
def _sleep(self): def _sleep(self):
"""Sleep a short time if self.sleep is set. Called before each request to instagram.com.""" """Sleep a short time if self.sleep is set. Called before each request to instagram.com."""
if not self.sleep: if self.sleep:
return time.sleep(random.uniform(0.5, 3))
max_sleep_int = 600/50 # 50 requests per 10 minutes
count_for_max_sleep = 80 # after 80 requests.
sleep_interval = min(self.request_count, count_for_max_sleep) / count_for_max_sleep * max_sleep_int
current_time = time.monotonic()
sleep_time = self.last_request_time + sleep_interval - current_time
if sleep_time > 0.0:
time.sleep(sleep_time)
self.request_count += 1
self.last_request_time = max(current_time, self.last_request_time + sleep_interval)
def _get_and_write_raw(self, url: str, filename: str, tries: int = 3) -> None: def _get_and_write_raw(self, url: str, filename: str, tries: int = 3) -> None:
"""Downloads raw data. """Downloads raw data.
@ -446,18 +437,41 @@ class Instaloader:
self._sleep() self._sleep()
self._get_and_write_raw(url, filename, tries - 1) self._get_and_write_raw(url, filename, tries - 1)
def get_json(self, url: str, params: Optional[Dict[str, Any]] = None, def get_json(self, url: str, params: Dict[str, Any],
session: Optional[requests.Session] = None, tries: int = 3) -> Dict[str, Any]: session: Optional[requests.Session] = None, tries: int = 3) -> Dict[str, Any]:
"""JSON request to Instagram. """JSON request to Instagram.
:param url: URL, relative to https://www.instagram.com/ :param url: URL, relative to https://www.instagram.com/
:param params: GET parameters :param params: GET parameters
:param session: Session to use, or None to use self.session :param session: Session to use, or None to use self.session
:param tries: Maximum number of attempts until a exception is raised :param tries: Maximum number of attempts until an exception is raised
:return: Decoded response dictionary :return: Decoded response dictionary
:raises QueryReturnedNotFoundException: When the server responds with a 404. :raises QueryReturnedNotFoundException: When the server responds with a 404.
:raises ConnectionException: When query repeatedly failed. :raises ConnectionException: When query repeatedly failed.
""" """
def graphql_query_waittime(query_id: int, untracked_queries: bool = False) -> int:
sliding_window = 660
timestamps = self.previous_queries.get(query_id)
if not timestamps:
return sliding_window if untracked_queries else 0
current_time = time.monotonic()
timestamps = list(filter(lambda t: t > current_time - sliding_window, timestamps))
self.previous_queries[query_id] = timestamps
if len(timestamps) < 100 and not untracked_queries:
return 0
return round(min(timestamps) + sliding_window - current_time) + 6
is_graphql_query = 'query_id' in params and 'graphql/query' in url
if is_graphql_query:
query_id = params['query_id']
waittime = graphql_query_waittime(query_id)
if waittime > 0:
self._log('\nToo many queries in the last time. Need to wait {} seconds.'.format(waittime))
time.sleep(waittime)
timestamp_list = self.previous_queries.get(query_id)
if timestamp_list is not None:
timestamp_list.append(time.monotonic())
else:
self.previous_queries[query_id] = [time.monotonic()]
sess = session if session else self.session sess = session if session else self.session
try: try:
self._sleep() self._sleep()
@ -476,7 +490,7 @@ class Instaloader:
else: else:
raise ConnectionException("Returned \"{}\" status.".format(resp_json['status'])) raise ConnectionException("Returned \"{}\" status.".format(resp_json['status']))
return resp_json return resp_json
except (ConnectionException, json.decoder.JSONDecodeError) as err: except (ConnectionException, json.decoder.JSONDecodeError, requests.exceptions.RequestException) as err:
error_string = "JSON Query to {}: {}".format(url, err) error_string = "JSON Query to {}: {}".format(url, err)
if tries <= 1: if tries <= 1:
raise ConnectionException(error_string) raise ConnectionException(error_string)
@ -484,9 +498,13 @@ class Instaloader:
if isinstance(err, TooManyRequests): if isinstance(err, TooManyRequests):
text_for_429 = ("HTTP error code 429 was returned because too many queries occured in the last time. " text_for_429 = ("HTTP error code 429 was returned because too many queries occured in the last time. "
"Please do not use Instagram in your browser or run multiple instances of Instaloader " "Please do not use Instagram in your browser or run multiple instances of Instaloader "
"in parallel. The request is retried in about four minutes.") "in parallel.")
print(textwrap.fill(text_for_429), file=sys.stderr) print(textwrap.fill(text_for_429), file=sys.stderr)
time.sleep(660/3) if is_graphql_query:
waittime = graphql_query_waittime(query_id=params['query_id'], untracked_queries=True)
if waittime > 0:
self._log('The request will be retried in {} seconds.'.format(waittime))
time.sleep(waittime)
self._sleep() self._sleep()
self.get_json(url, params, sess, tries - 1) self.get_json(url, params, sess, tries - 1)