1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-08-17 12:19:38 +02:00

Wait smarter to avoid HTTP error code 429

Additional sleeps are necessary because Instagram is rate limiting
GraphQL queries. The error does not occur if not more than 100 queries
are made in a sliding window of eleven minutes.
Ports a894c2d to version 3.
This commit is contained in:
André Koch-Kramer 2017-08-24 18:30:46 +02:00
parent 2e47642f74
commit bb71c40b56

View File

@ -6,6 +6,7 @@ import getpass
import json
import os
import pickle
import random
import re
import shutil
import string
@ -356,6 +357,7 @@ class Instaloader:
self.download_geotags = download_geotags
self.download_captions = download_captions
self.download_comments = download_comments
self.previous_queries = dict()
# error log, filled with error() and printed at the end of Instaloader.main()
self.error_log = []
@ -375,12 +377,10 @@ class Instaloader:
self.dirname_pattern, self.filename_pattern,
self.download_videos, self.download_geotags,
self.download_captions, self.download_comments)
new_loader.request_count = self.request_count
new_loader.last_request_time = self.last_request_time
new_loader.previous_queries = self.previous_queries
yield new_loader
self.error_log.extend(new_loader.error_log)
self.request_count = new_loader.request_count
self.last_request_time = new_loader.last_request_time
self.previous_queries = new_loader.previous_queries
def _log(self, *msg, sep='', end='\n', flush=False):
"""Log a message to stdout that can be suppressed with --quiet."""
@ -408,17 +408,8 @@ class Instaloader:
def _sleep(self):
"""Sleep a short time if self.sleep is set. Called before each request to instagram.com."""
if not self.sleep:
return
max_sleep_int = 600/50 # 50 requests per 10 minutes
count_for_max_sleep = 80 # after 80 requests.
sleep_interval = min(self.request_count, count_for_max_sleep) / count_for_max_sleep * max_sleep_int
current_time = time.monotonic()
sleep_time = self.last_request_time + sleep_interval - current_time
if sleep_time > 0.0:
time.sleep(sleep_time)
self.request_count += 1
self.last_request_time = max(current_time, self.last_request_time + sleep_interval)
if self.sleep:
time.sleep(random.uniform(0.5, 3))
def _get_and_write_raw(self, url: str, filename: str, tries: int = 3) -> None:
"""Downloads raw data.
@ -446,18 +437,41 @@ class Instaloader:
self._sleep()
self._get_and_write_raw(url, filename, tries - 1)
def get_json(self, url: str, params: Optional[Dict[str, Any]] = None,
def get_json(self, url: str, params: Dict[str, Any],
session: Optional[requests.Session] = None, tries: int = 3) -> Dict[str, Any]:
"""JSON request to Instagram.
:param url: URL, relative to https://www.instagram.com/
:param params: GET parameters
:param session: Session to use, or None to use self.session
:param tries: Maximum number of attempts until a exception is raised
:param tries: Maximum number of attempts until an exception is raised
:return: Decoded response dictionary
:raises QueryReturnedNotFoundException: When the server responds with a 404.
:raises ConnectionException: When query repeatedly failed.
"""
def graphql_query_waittime(query_id: int, untracked_queries: bool = False) -> int:
sliding_window = 660
timestamps = self.previous_queries.get(query_id)
if not timestamps:
return sliding_window if untracked_queries else 0
current_time = time.monotonic()
timestamps = list(filter(lambda t: t > current_time - sliding_window, timestamps))
self.previous_queries[query_id] = timestamps
if len(timestamps) < 100 and not untracked_queries:
return 0
return round(min(timestamps) + sliding_window - current_time) + 6
is_graphql_query = 'query_id' in params and 'graphql/query' in url
if is_graphql_query:
query_id = params['query_id']
waittime = graphql_query_waittime(query_id)
if waittime > 0:
self._log('\nToo many queries in the last time. Need to wait {} seconds.'.format(waittime))
time.sleep(waittime)
timestamp_list = self.previous_queries.get(query_id)
if timestamp_list is not None:
timestamp_list.append(time.monotonic())
else:
self.previous_queries[query_id] = [time.monotonic()]
sess = session if session else self.session
try:
self._sleep()
@ -476,7 +490,7 @@ class Instaloader:
else:
raise ConnectionException("Returned \"{}\" status.".format(resp_json['status']))
return resp_json
except (ConnectionException, json.decoder.JSONDecodeError) as err:
except (ConnectionException, json.decoder.JSONDecodeError, requests.exceptions.RequestException) as err:
error_string = "JSON Query to {}: {}".format(url, err)
if tries <= 1:
raise ConnectionException(error_string)
@ -484,9 +498,13 @@ class Instaloader:
if isinstance(err, TooManyRequests):
text_for_429 = ("HTTP error code 429 was returned because too many queries occured in the last time. "
"Please do not use Instagram in your browser or run multiple instances of Instaloader "
"in parallel. The request is retried in about four minutes.")
"in parallel.")
print(textwrap.fill(text_for_429), file=sys.stderr)
time.sleep(660/3)
if is_graphql_query:
waittime = graphql_query_waittime(query_id=params['query_id'], untracked_queries=True)
if waittime > 0:
self._log('The request will be retried in {} seconds.'.format(waittime))
time.sleep(waittime)
self._sleep()
self.get_json(url, params, sess, tries - 1)