1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-08-18 12:49:38 +02:00

Name profilepic by hash if Last-Modified missing

This fixes #188.

Also, this commit avoids requesting the profile pic URL twice: previously a
separate HEAD request was made just to obtain the Last-Modified header.
This commit is contained in:
Alexander Graf 2019-01-03 16:49:21 +01:00
parent 06845b53fc
commit 4bc0a94e12
2 changed files with 42 additions and 20 deletions

View File

@ -11,6 +11,7 @@ import tempfile
from contextlib import contextmanager, suppress
from datetime import datetime, timezone
from functools import wraps
from hashlib import md5
from io import BytesIO
from typing import Any, Callable, Iterator, List, Optional, Set, Union
@ -289,26 +290,32 @@ class Instaloader:
"""Downloads and saves profile pic."""
def _epoch_to_string(epoch: datetime) -> str:
return epoch.strftime('%Y-%m-%d_%H-%M-%S')
return epoch.strftime('%Y-%m-%d_%H-%M-%S_UTC')
profile_pic_url = profile.profile_pic_url
with self.context.get_anonymous_session() as anonymous_session:
date_object = datetime.strptime(anonymous_session.head(profile_pic_url).headers["Last-Modified"],
'%a, %d %b %Y %H:%M:%S GMT')
profile_pic_response = self.context.get_raw(profile.profile_pic_url)
if 'Last-Modified' in profile_pic_response.headers:
date_object = datetime.strptime(profile_pic_response.headers["Last-Modified"], '%a, %d %b %Y %H:%M:%S GMT')
profile_pic_bytes = None
profile_pic_identifier = _epoch_to_string(date_object)
else:
date_object = None
profile_pic_bytes = profile_pic_response.content
profile_pic_identifier = md5(profile_pic_bytes).hexdigest()[:16]
profile_pic_extension = 'jpg'
if ((format_string_contains_key(self.dirname_pattern, 'profile') or
format_string_contains_key(self.dirname_pattern, 'target'))):
filename = '{0}/{1}_UTC_profile_pic.{2}'.format(self.dirname_pattern.format(profile=profile.username.lower(),
target=profile.username.lower()),
_epoch_to_string(date_object), profile_pic_extension)
filename = '{0}/{1}_profile_pic.{2}'.format(self.dirname_pattern.format(profile=profile.username.lower(),
target=profile.username.lower()),
profile_pic_identifier, profile_pic_extension)
else:
filename = '{0}/{1}_{2}_UTC_profile_pic.{3}'.format(self.dirname_pattern.format(), profile.username.lower(),
_epoch_to_string(date_object), profile_pic_extension)
filename = '{0}/{1}_{2}_profile_pic.{3}'.format(self.dirname_pattern.format(), profile.username.lower(),
profile_pic_identifier, profile_pic_extension)
if os.path.isfile(filename):
self.context.log(filename + ' already exists')
return None
self.context.get_and_write_raw(profile_pic_url, filename)
os.utime(filename, (datetime.now().timestamp(), date_object.timestamp()))
self.context.write_raw(profile_pic_bytes if profile_pic_bytes else profile_pic_response, filename)
if date_object:
os.utime(filename, (datetime.now().timestamp(), date_object.timestamp()))
self.context.log('') # log output of _get_and_write_raw() does not produce \n
@_requires_login

View File

@ -10,7 +10,7 @@ import time
import urllib.parse
from contextlib import contextmanager
from datetime import datetime, timedelta
from typing import Any, Callable, Dict, Iterator, Optional
from typing import Any, Callable, Dict, Iterator, Optional, Union
import requests
import requests.utils
@ -429,8 +429,17 @@ class InstaloaderContext:
data = _query()
yield from (edge['node'] for edge in data['edges'])
def get_and_write_raw(self, url: str, filename: str, _attempt=1) -> None:
"""Downloads raw data.
def write_raw(self, resp: Union[bytes, requests.Response], filename: str) -> None:
    """Write raw data into a file.

    :param resp: Either the payload itself as bytes, or a response object whose
       raw stream is copied into the file.
    :param filename: Path of the file that is opened for binary writing."""
    # The filename is logged without a trailing newline before anything is written.
    self.log(filename, end=' ', flush=True)
    with open(filename, 'wb') as target:
        if not isinstance(resp, requests.Response):
            # Plain bytes: dump them in one go.
            target.write(resp)
            return
        # Response object: copy its underlying raw stream into the file.
        shutil.copyfileobj(resp.raw, target)
def get_raw(self, url: str, _attempt=1) -> requests.Response:
"""Downloads a file anonymously.
:raises QueryReturnedNotFoundException: When the server responds with a 404.
:raises QueryReturnedForbiddenException: When the server responds with a 403.
@ -439,10 +448,8 @@ class InstaloaderContext:
with self.get_anonymous_session() as anonymous_session:
resp = anonymous_session.get(url, stream=True)
if resp.status_code == 200:
self.log(filename, end=' ', flush=True)
with open(filename, 'wb') as file:
resp.raw.decode_content = True
shutil.copyfileobj(resp.raw, file)
resp.raw.decode_content = True
return resp
else:
if resp.status_code == 403:
# suspected invalid URL signature
@ -458,11 +465,19 @@ class InstaloaderContext:
self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
try:
self._sleep()
self.get_and_write_raw(url, filename, _attempt + 1)
return self.get_raw(url, _attempt + 1)
except KeyboardInterrupt:
self.error("[skipped by user]", repeat_at_end=False)
raise ConnectionException(error_string) from err
def get_and_write_raw(self, url: str, filename: str) -> None:
    """Fetch anonymously-requested raw data from a URL and store it in a file.

    :param url: Address that is handed to :meth:`get_raw`.
    :param filename: Destination path that is handed to :meth:`write_raw`.
    :raises QueryReturnedNotFoundException: When the server responds with a 404.
    :raises QueryReturnedForbiddenException: When the server responds with a 403.
    :raises ConnectionException: When download repeatedly failed."""
    response = self.get_raw(url)
    self.write_raw(response, filename)
@property
def root_rhx_gis(self) -> Optional[str]:
"""rhx_gis string returned in the / query."""