From bb71c40b568445c5cd2c17875636b9aa7abd2dd8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Koch-Kramer?=
Date: Thu, 24 Aug 2017 18:30:46 +0200
Subject: [PATCH] Wait smarter to avoid HTTP error code 429

Additional sleeps are necessary because Instagram rate-limits GraphQL
queries. The error does not occur as long as no more than 100 queries are
made within a sliding window of eleven minutes.

Ports a894c2d to version 3.
---
 instaloader.py | 58 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/instaloader.py b/instaloader.py
index 3b42949..352895b 100755
--- a/instaloader.py
+++ b/instaloader.py
@@ -6,6 +6,7 @@ import getpass
 import json
 import os
 import pickle
+import random
 import re
 import shutil
 import string
@@ -356,6 +357,7 @@ class Instaloader:
         self.download_geotags = download_geotags
         self.download_captions = download_captions
         self.download_comments = download_comments
+        self.previous_queries = dict()
 
         # error log, filled with error() and printed at the end of Instaloader.main()
         self.error_log = []
@@ -375,12 +377,10 @@ class Instaloader:
                                  self.dirname_pattern, self.filename_pattern,
                                  self.download_videos, self.download_geotags,
                                  self.download_captions, self.download_comments)
-        new_loader.request_count = self.request_count
-        new_loader.last_request_time = self.last_request_time
+        new_loader.previous_queries = self.previous_queries
         yield new_loader
         self.error_log.extend(new_loader.error_log)
-        self.request_count = new_loader.request_count
-        self.last_request_time = new_loader.last_request_time
+        self.previous_queries = new_loader.previous_queries
 
     def _log(self, *msg, sep='', end='\n', flush=False):
         """Log a message to stdout that can be suppressed with --quiet."""
@@ -408,17 +408,8 @@ class Instaloader:
 
     def _sleep(self):
         """Sleep a short time if self.sleep is set. Called before each request to instagram.com."""
-        if not self.sleep:
-            return
-        max_sleep_int = 600/50  # 50 requests per 10 minutes
-        count_for_max_sleep = 80  # after 80 requests.
-        sleep_interval = min(self.request_count, count_for_max_sleep) / count_for_max_sleep * max_sleep_int
-        current_time = time.monotonic()
-        sleep_time = self.last_request_time + sleep_interval - current_time
-        if sleep_time > 0.0:
-            time.sleep(sleep_time)
-        self.request_count += 1
-        self.last_request_time = max(current_time, self.last_request_time + sleep_interval)
+        if self.sleep:
+            time.sleep(random.uniform(0.5, 3))
 
     def _get_and_write_raw(self, url: str, filename: str, tries: int = 3) -> None:
         """Downloads raw data.
@@ -446,18 +437,41 @@ class Instaloader:
             self._sleep()
             self._get_and_write_raw(url, filename, tries - 1)
 
-    def get_json(self, url: str, params: Optional[Dict[str, Any]] = None,
+    def get_json(self, url: str, params: Dict[str, Any],
                  session: Optional[requests.Session] = None, tries: int = 3) -> Dict[str, Any]:
         """JSON request to Instagram.
 
         :param url: URL, relative to https://www.instagram.com/
         :param params: GET parameters
         :param session: Session to use, or None to use self.session
-        :param tries: Maximum number of attempts until a exception is raised
+        :param tries: Maximum number of attempts until an exception is raised
         :return: Decoded response dictionary
         :raises QueryReturnedNotFoundException: When the server responds with a 404.
         :raises ConnectionException: When query repeatedly failed.
""" + def graphql_query_waittime(query_id: int, untracked_queries: bool = False) -> int: + sliding_window = 660 + timestamps = self.previous_queries.get(query_id) + if not timestamps: + return sliding_window if untracked_queries else 0 + current_time = time.monotonic() + timestamps = list(filter(lambda t: t > current_time - sliding_window, timestamps)) + self.previous_queries[query_id] = timestamps + if len(timestamps) < 100 and not untracked_queries: + return 0 + return round(min(timestamps) + sliding_window - current_time) + 6 + is_graphql_query = 'query_id' in params and 'graphql/query' in url + if is_graphql_query: + query_id = params['query_id'] + waittime = graphql_query_waittime(query_id) + if waittime > 0: + self._log('\nToo many queries in the last time. Need to wait {} seconds.'.format(waittime)) + time.sleep(waittime) + timestamp_list = self.previous_queries.get(query_id) + if timestamp_list is not None: + timestamp_list.append(time.monotonic()) + else: + self.previous_queries[query_id] = [time.monotonic()] sess = session if session else self.session try: self._sleep() @@ -476,7 +490,7 @@ class Instaloader: else: raise ConnectionException("Returned \"{}\" status.".format(resp_json['status'])) return resp_json - except (ConnectionException, json.decoder.JSONDecodeError) as err: + except (ConnectionException, json.decoder.JSONDecodeError, requests.exceptions.RequestException) as err: error_string = "JSON Query to {}: {}".format(url, err) if tries <= 1: raise ConnectionException(error_string) @@ -484,9 +498,13 @@ class Instaloader: if isinstance(err, TooManyRequests): text_for_429 = ("HTTP error code 429 was returned because too many queries occured in the last time. " "Please do not use Instagram in your browser or run multiple instances of Instaloader " - "in parallel. The request is retried in about four minutes.") + "in parallel.") print(textwrap.fill(text_for_429), file=sys.stderr) - time.sleep(660/3) + if is_graphql_query: + waittime = graphql_query_waittime(query_id=params['query_id'], untracked_queries=True) + if waittime > 0: + self._log('The request will be retried in {} seconds.'.format(waittime)) + time.sleep(waittime) self._sleep() self.get_json(url, params, sess, tries - 1)