From 4bc0a94e12834b35255874da034cfe37a552be58 Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Thu, 3 Jan 2019 16:49:21 +0100 Subject: [PATCH] Name profilepic by hash if Last-Modified missing This fixes #188. Also, this commit prevents double-requesting the profile pic URL to obtain the Last-Modified header. --- instaloader/instaloader.py | 31 +++++++++++++++++++------------ instaloader/instaloadercontext.py | 31 +++++++++++++++++++++++-------- 2 files changed, 42 insertions(+), 20 deletions(-) diff --git a/instaloader/instaloader.py b/instaloader/instaloader.py index c43a1d0..ad88749 100644 --- a/instaloader/instaloader.py +++ b/instaloader/instaloader.py @@ -11,6 +11,7 @@ import tempfile from contextlib import contextmanager, suppress from datetime import datetime, timezone from functools import wraps +from hashlib import md5 from io import BytesIO from typing import Any, Callable, Iterator, List, Optional, Set, Union @@ -289,26 +290,32 @@ class Instaloader: """Downloads and saves profile pic.""" def _epoch_to_string(epoch: datetime) -> str: - return epoch.strftime('%Y-%m-%d_%H-%M-%S') + return epoch.strftime('%Y-%m-%d_%H-%M-%S_UTC') - profile_pic_url = profile.profile_pic_url - with self.context.get_anonymous_session() as anonymous_session: - date_object = datetime.strptime(anonymous_session.head(profile_pic_url).headers["Last-Modified"], - '%a, %d %b %Y %H:%M:%S GMT') + profile_pic_response = self.context.get_raw(profile.profile_pic_url) + if 'Last-Modified' in profile_pic_response.headers: + date_object = datetime.strptime(profile_pic_response.headers["Last-Modified"], '%a, %d %b %Y %H:%M:%S GMT') + profile_pic_bytes = None + profile_pic_identifier = _epoch_to_string(date_object) + else: + date_object = None + profile_pic_bytes = profile_pic_response.content + profile_pic_identifier = md5(profile_pic_bytes).hexdigest()[:16] profile_pic_extension = 'jpg' if ((format_string_contains_key(self.dirname_pattern, 'profile') or format_string_contains_key(self.dirname_pattern, 'target'))): - filename = '{0}/{1}_UTC_profile_pic.{2}'.format(self.dirname_pattern.format(profile=profile.username.lower(), - target=profile.username.lower()), - _epoch_to_string(date_object), profile_pic_extension) + filename = '{0}/{1}_profile_pic.{2}'.format(self.dirname_pattern.format(profile=profile.username.lower(), + target=profile.username.lower()), + profile_pic_identifier, profile_pic_extension) else: - filename = '{0}/{1}_{2}_UTC_profile_pic.{3}'.format(self.dirname_pattern.format(), profile.username.lower(), - _epoch_to_string(date_object), profile_pic_extension) + filename = '{0}/{1}_{2}_profile_pic.{3}'.format(self.dirname_pattern.format(), profile.username.lower(), + profile_pic_identifier, profile_pic_extension) if os.path.isfile(filename): self.context.log(filename + ' already exists') return None - self.context.get_and_write_raw(profile_pic_url, filename) - os.utime(filename, (datetime.now().timestamp(), date_object.timestamp())) + self.context.write_raw(profile_pic_bytes if profile_pic_bytes else profile_pic_response, filename) + if date_object: + os.utime(filename, (datetime.now().timestamp(), date_object.timestamp())) self.context.log('') # log output of _get_and_write_raw() does not produce \n @_requires_login diff --git a/instaloader/instaloadercontext.py b/instaloader/instaloadercontext.py index bed073d..fb7b42f 100644 --- a/instaloader/instaloadercontext.py +++ b/instaloader/instaloadercontext.py @@ -10,7 +10,7 @@ import time import urllib.parse from contextlib import contextmanager from datetime import datetime, timedelta -from typing import Any, Callable, Dict, Iterator, Optional +from typing import Any, Callable, Dict, Iterator, Optional, Union import requests import requests.utils @@ -429,8 +429,17 @@ class InstaloaderContext: data = _query() yield from (edge['node'] for edge in data['edges']) - def get_and_write_raw(self, url: str, filename: str, _attempt=1) -> None: - """Downloads raw data. + def write_raw(self, resp: Union[bytes, requests.Response], filename: str) -> None: + """Write raw response data into a file.""" + self.log(filename, end=' ', flush=True) + with open(filename, 'wb') as file: + if isinstance(resp, requests.Response): + shutil.copyfileobj(resp.raw, file) + else: + file.write(resp) + + def get_raw(self, url: str, _attempt=1) -> requests.Response: + """Downloads a file anonymously. :raises QueryReturnedNotFoundException: When the server responds with a 404. :raises QueryReturnedForbiddenException: When the server responds with a 403. @@ -439,10 +448,8 @@ class InstaloaderContext: with self.get_anonymous_session() as anonymous_session: resp = anonymous_session.get(url, stream=True) if resp.status_code == 200: - self.log(filename, end=' ', flush=True) - with open(filename, 'wb') as file: - resp.raw.decode_content = True - shutil.copyfileobj(resp.raw, file) + resp.raw.decode_content = True + return resp else: if resp.status_code == 403: # suspected invalid URL signature @@ -458,11 +465,19 @@ class InstaloaderContext: self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False) try: self._sleep() - self.get_and_write_raw(url, filename, _attempt + 1) + return self.get_raw(url, _attempt + 1) except KeyboardInterrupt: self.error("[skipped by user]", repeat_at_end=False) raise ConnectionException(error_string) from err + def get_and_write_raw(self, url: str, filename: str) -> None: + """Downloads and writes anonymously-requested raw data into a file. + + :raises QueryReturnedNotFoundException: When the server responds with a 404. + :raises QueryReturnedForbiddenException: When the server responds with a 403. + :raises ConnectionException: When download repeatedly failed.""" + self.write_raw(self.get_raw(url), filename) + @property def root_rhx_gis(self) -> Optional[str]: """rhx_gis string returned in the / query."""