#!/usr/bin/env python3

"""Tool to download pictures (or videos) and captions from Instagram, from a given set
of profiles (even if private), from your feed or from all followees of a given profile."""

import re, json, datetime, shutil, os, time, random, sys, pickle, getpass, tempfile
from argparse import ArgumentParser
from io import BytesIO
from numbers import Real
from typing import List, Optional, Any, Dict, Callable

import requests, requests.utils

# To get version from setup.py for instaloader --version
import pkg_resources
try:
    # pylint:disable=no-member
    __version__ = pkg_resources.get_distribution('instaloader').version
except pkg_resources.DistributionNotFound:
    __version__ = 'Run ./setup.py --version'

try:
    # pylint:disable=wrong-import-position
    import win_unicode_console
except ImportError:
    WINUNICODE = False
else:
    win_unicode_console.enable()
    WINUNICODE = True


class InstaloaderException(Exception):
    """Base exception for this script"""
    pass


class NonfatalException(InstaloaderException):
    """Base exception for errors which should not cause instaloader to stop"""
    pass


class ProfileNotExistsException(NonfatalException):
    pass


class ProfileAccessDeniedException(NonfatalException):
    pass


class ProfileHasNoPicsException(NonfatalException):
    pass


class PrivateProfileNotFollowedException(NonfatalException):
    pass


class LoginRequiredException(NonfatalException):
    pass


class BadCredentialsException(InstaloaderException):
    pass


class ConnectionException(InstaloaderException):
    pass


def _log(*msg, sep='', end='\n', flush=False, quiet=False):
    if not quiet:
        print(*msg, sep=sep, end=end, flush=flush)


def get_json(name: str, session: requests.Session,
             max_id: Optional[str] = None, sleep: bool = True) -> Optional[Dict[str, Any]]:
    """Return JSON of a profile"""
    if not max_id:
        resp = session.get('https://www.instagram.com/' + name)
    else:
        resp = session.get('https://www.instagram.com/' + name, params={'max_id': max_id})
    if sleep:
        time.sleep(4 * random.random() + 1)
    match = re.search('window\\._sharedData = .*<', resp.text)
    if match is not None:
        return json.loads(match.group(0)[21:-2])


def get_username_by_id(session: requests.Session, profile_id: int) -> str:
    """To get the current username of a profile, given its unique ID, this function can be used.
    session is required to be a logged-in (i.e. non-anonymous) session."""
    tempsession = copy_session(session)
    tempsession.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
    resp = tempsession.post('https://www.instagram.com/query/', data='q=ig_user(' +
                            str(profile_id) + ')+%7B%0A++username%0A%7D%0A')
    if resp.status_code == 200:
        data = json.loads(resp.text)
        if 'username' in data:
            return data['username']
        raise ProfileNotExistsException("No profile found, the user may have blocked "
                                        "you (id: " + str(profile_id) + ").")
    else:
        if test_login(session):
            raise ProfileAccessDeniedException("Username could not be determined due to error {0} (id: {1})."
                                               .format(str(resp.status_code), str(profile_id)))
        raise LoginRequiredException("Login required to determine username (id: " +
                                     str(profile_id) + ").")


def get_id_by_username(profile: str) -> int:
    """Each Instagram profile has its own unique ID which stays unmodified even if a user changes
    his/her username. To get said ID, given the profile's name, you may call this function."""
    data = get_json(profile, get_anonymous_session())
    if "ProfilePage" not in data["entry_data"]:
        raise ProfileNotExistsException("Profile {0} does not exist.".format(profile))
    return int(data['entry_data']['ProfilePage'][0]['user']['id'])
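
# A minimal usage sketch (assuming a reachable Instagram server and, for the
# reverse lookup, a logged-in session; 'instagram' is just an example profile):
#
#     profile_id = get_id_by_username('instagram')
#     name = get_username_by_id(load_session('YOUR-USERNAME'), profile_id)
#
# Since the ID is stable across renames, storing it (as check_id() below does)
# allows re-finding a profile after its username changed.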


def _epoch_to_string(epoch: Real) -> str:
    return datetime.datetime.fromtimestamp(epoch).strftime('%Y-%m-%d_%H-%M-%S')
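
# For example (the result depends on the local timezone, so this is only a sketch):
#     _epoch_to_string(1466000000)  # -> '2016-06-15_16-13-20' in UTC+2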


def get_followees(profile: str, session: requests.Session) -> List[Dict[str, Any]]:
    """
    Retrieve list of followees of given profile

    :param profile: Name of profile to lookup followees
    :param session: Session belonging to a user, i.e. not an anonymous session
    :return: List of followees (list of dictionaries), as returned by instagram server
    """
    tmpsession = copy_session(session)
    data = get_json(profile, tmpsession)
    profile_id = data['entry_data']['ProfilePage'][0]['user']['id']
    query = ["q=ig_user(" + profile_id + ")+%7B%0A"
             "++follows.",
             str(data['entry_data']['ProfilePage'][0]['user']['follows']['count']) +
             ")+%7B%0A"
             "++++count%2C%0A"
             "++++page_info+%7B%0A"
             "++++++end_cursor%2C%0A"
             "++++++has_next_page%0A"
             "++++%7D%2C%0A"
             "++++nodes+%7B%0A"
             "++++++id%2C%0A"
             "++++++full_name%2C%0A"
             "++++++username%2C%0A"
             "++++++followed_by+%7B%0A"
             "++++++++count%0A"
             "++++++%7D%0A"
             "++++%7D%0A"
             "++%7D%0A"
             "%7D%0A"
             "&ref=relationships%3A%3Afollow_list"]
    tmpsession.headers.update(default_http_header())
    tmpsession.headers.update({'Referer': 'https://www.instagram.com/' + profile + '/following/'})
    tmpsession.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
    resp = tmpsession.post('https://www.instagram.com/query/', data=query[0] + "first(" + query[1])
    if resp.status_code == 200:
        data = json.loads(resp.text)
        followees = []
        while True:
            for followee in data['follows']['nodes']:
                followee['follower_count'] = followee.pop('followed_by')['count']
                followees = followees + [followee]
            if data['follows']['page_info']['has_next_page']:
                resp = tmpsession.post('https://www.instagram.com/query/', data=query[0]
                                       + "after("
                                       + data['follows']['page_info']['end_cursor']
                                       + "%2C+" + query[1])
                data = json.loads(resp.text)
            else:
                break
        return followees
    if test_login(tmpsession):
        raise ConnectionException("ConnectionError(" + str(resp.status_code) + "): "
                                  "unable to gather followees.")
    raise LoginRequiredException("Login required to gather followees.")


def download_pic(name: str, url: str, date_epoch: Real, outputlabel: Optional[str] = None, quiet: bool = False,
                 filename_suffix: Optional[str] = None) -> bool:
    """Downloads and saves picture with given url under given directory with given timestamp.
    Returns true, if file was actually downloaded, i.e. updated."""
    if outputlabel is None:
        outputlabel = _epoch_to_string(date_epoch)
    urlmatch = re.search('\\.[a-z]*\\?', url)
    file_extension = url[-3:] if urlmatch is None else urlmatch.group(0)[1:-1]
    filename = name.lower() + '/' + _epoch_to_string(date_epoch)
    if filename_suffix is not None:
        filename += '_' + filename_suffix
    filename += '.' + file_extension
    if os.path.isfile(filename):
        _log(outputlabel + ' exists', end=' ', flush=True, quiet=quiet)
        return False
    resp = get_anonymous_session().get(url, stream=True)
    if resp.status_code == 200:
        _log(outputlabel, end=' ', flush=True, quiet=quiet)
        os.makedirs(name.lower(), exist_ok=True)
        with open(filename, 'wb') as file:
            resp.raw.decode_content = True
            shutil.copyfileobj(resp.raw, file)
        os.utime(filename, (datetime.datetime.now().timestamp(), date_epoch))
        return True
    else:
        raise ConnectionException("File '" + url + "' could not be downloaded.")


def save_caption(name: str, date_epoch: Real, caption: str, shorter_output: bool = False, quiet: bool = False) -> None:
    """Updates picture caption"""
    filename = name.lower() + '/' + _epoch_to_string(date_epoch) + '.txt'
    pcaption = caption.replace('\n', ' ').strip()
    caption = caption.encode("UTF-8")
    if shorter_output:
        pcaption = "txt"
    else:
        pcaption = '[' + ((pcaption[:29] + u"\u2026") if len(pcaption) > 31 else pcaption) + ']'
    try:
        with open(filename, 'rb') as file:
            file_caption = file.read()
        if file_caption.replace(b'\r\n', b'\n') == caption.replace(b'\r\n', b'\n'):
            try:
                _log(pcaption + ' unchanged', end=' ', flush=True, quiet=quiet)
            except UnicodeEncodeError:
                _log('txt unchanged', end=' ', flush=True, quiet=quiet)
            return None
        else:
            def get_filename(index):
                return filename if index == 0 else (filename[:-4] + '_old_' +
                                                    (str(0) if index < 10 else str()) + str(index) + filename[-4:])
            i = 0
            while os.path.isfile(get_filename(i)):
                i = i + 1
            for index in range(i, 0, -1):
                os.rename(get_filename(index - 1), get_filename(index))
            try:
                _log(pcaption + ' updated', end=' ', flush=True, quiet=quiet)
            except UnicodeEncodeError:
                _log('txt updated', end=' ', flush=True, quiet=quiet)
    except FileNotFoundError:
        pass
    try:
        _log(pcaption, end=' ', flush=True, quiet=quiet)
    except UnicodeEncodeError:
        _log('txt', end=' ', flush=True, quiet=quiet)
    os.makedirs(name.lower(), exist_ok=True)
    with open(filename, 'wb') as text_file:
        shutil.copyfileobj(BytesIO(caption), text_file)
    os.utime(filename, (datetime.datetime.now().timestamp(), date_epoch))


def save_location(name: str, location_json: Dict[str, str], date_epoch: Real, quiet: bool = False) -> None:
    """Save post location name and Google Maps link."""
    filename = name.lower() + '/' + _epoch_to_string(date_epoch) + '_location.txt'
    location_string = location_json["name"] + "\n" + \
                      "https://maps.google.com/maps?q={0},{1}&ll={0},{1}\n" \
                      .format(location_json["lat"], location_json["lng"])
    os.makedirs(name.lower(), exist_ok=True)
    with open(filename, 'wb') as text_file:
        shutil.copyfileobj(BytesIO(location_string.encode()), text_file)
    os.utime(filename, (datetime.datetime.now().timestamp(), date_epoch))
    _log('geo', end=' ', flush=True, quiet=quiet)
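
# For a location dict such as {'name': 'Some Place', 'lat': 48.2, 'lng': 16.4}
# (shape as returned by get_location() below), the written file would contain:
#
#     Some Place
#     https://maps.google.com/maps?q=48.2,16.4&ll=48.2,16.4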


def download_profilepic(name: str, url: str, quiet: bool = False) -> None:
    """Downloads and saves profile pic with given url."""
    date_object = datetime.datetime.strptime(requests.head(url).headers["Last-Modified"],
                                             '%a, %d %b %Y %H:%M:%S GMT')
    filename = name.lower() + '/' + _epoch_to_string(date_object.timestamp()) + \
               '_UTC_profile_pic.' + url[-3:]
    if os.path.isfile(filename):
        _log(filename + ' already exists', quiet=quiet)
        return None
    match = re.search('http.*://.*instagram.*[^/]*\\.(com|net)/[^/]+/.', url)
    if match is None:
        raise ConnectionException("URL '" + url + "' could not be processed.")
    index = len(match.group(0)) - 1
    offset = 8 if match.group(0)[-1:] == 's' else 0
    url = url[:index] + 's2048x2048' + ('/' if offset == 0 else str()) + url[index + offset:]
    resp = get_anonymous_session().get(url, stream=True)
    if resp.status_code == 200:
        _log(filename, quiet=quiet)
        os.makedirs(name.lower(), exist_ok=True)
        with open(filename, 'wb') as file:
            resp.raw.decode_content = True
            shutil.copyfileobj(resp.raw, file)
        os.utime(filename, (datetime.datetime.now().timestamp(), date_object.timestamp()))
    else:
        raise ConnectionException("File '" + url + "' could not be downloaded.")


def get_default_session_filename(username: str) -> str:
    """Returns default session filename for given username."""
    dirname = tempfile.gettempdir() + "/" + ".instaloader-" + getpass.getuser()
    filename = dirname + "/" + "session-" + username
    return filename
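
# On a typical Linux box this yields a path of the form (illustrative only):
#     /tmp/.instaloader-<system user>/session-<instagram user>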


def save_session(session: requests.Session, username: str, filename: Optional[str] = None, quiet: bool = False) -> None:
    """Saves requests.Session object."""
    if filename is None:
        filename = get_default_session_filename(username)
    dirname = os.path.dirname(filename)
    if dirname != '' and not os.path.exists(dirname):
        os.makedirs(dirname)
        os.chmod(dirname, 0o700)
    with open(filename, 'wb') as sessionfile:
        os.chmod(filename, 0o600)
        pickle.dump(requests.utils.dict_from_cookiejar(session.cookies), sessionfile)
    _log("Saved session to %s." % filename, quiet=quiet)


def load_session(username: str, filename: Optional[str] = None, quiet: bool = False) -> Optional[requests.Session]:
    """Returns loaded requests.Session object, or None if not found."""
    if filename is None:
        filename = get_default_session_filename(username)
    try:
        with open(filename, 'rb') as sessionfile:
            session = requests.Session()
            session.cookies = requests.utils.cookiejar_from_dict(pickle.load(sessionfile))
            session.headers.update(default_http_header())
            session.headers.update({'X-CSRFToken': session.cookies.get_dict()['csrftoken']})
            _log("Loaded session from %s." % filename, quiet=quiet)
            return session
    except FileNotFoundError:
        pass
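
# Round-trip sketch (placeholder credentials; only the cookie jar is pickled,
# headers are rebuilt on load):
#
#     session = get_session('YOUR-USERNAME', 'YOUR-PASSWORD')
#     save_session(session, 'YOUR-USERNAME')
#     session = load_session('YOUR-USERNAME')  # None if no session file exists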


def copy_session(session: requests.Session) -> requests.Session:
    """Duplicates a requests.Session."""
    new = requests.Session()
    new.cookies = \
        requests.utils.cookiejar_from_dict(requests.utils.dict_from_cookiejar(session.cookies))
    new.headers = session.headers
    return new
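
# Note: the cookie jar is deep-copied via its dict form, but `new.headers`
# still refers to the same mapping as `session.headers`, so header updates on
# the copy (as in get_username_by_id() above) also show up on the original.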


def test_login(session: requests.Session) -> Optional[str]:
    """Returns the Instagram username to which given requests.Session object belongs, or None."""
    if session is None:
        return
    data = get_json(str(), session)
    if data['config']['viewer'] is None:
        return
    time.sleep(4 * random.random() + 1)
    return data['config']['viewer']['username']
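
# Usage sketch (placeholder name); the return value doubles as a logged-in check:
#     if test_login(session) != 'YOUR-USERNAME':
#         session = get_logged_in_session('YOUR-USERNAME')  # defined below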


def default_http_header(empty_session_only: bool = False) -> Dict[str, str]:
    """Returns default HTTP header we use for requests."""
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 ' \
                 '(KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36'
    header = {'Accept-Encoding': 'gzip, deflate',
              'Accept-Language': 'en-US,en;q=0.8',
              'Connection': 'keep-alive',
              'Content-Length': '0',
              'Host': 'www.instagram.com',
              'Origin': 'https://www.instagram.com',
              'Referer': 'https://www.instagram.com/',
              'User-Agent': user_agent,
              'X-Instagram-AJAX': '1',
              'X-Requested-With': 'XMLHttpRequest'}
    if empty_session_only:
        del header['Host']
        del header['Origin']
        del header['Referer']
        del header['X-Instagram-AJAX']
        del header['X-Requested-With']
    return header


def get_anonymous_session() -> requests.Session:
    """Returns our default anonymous requests.Session object."""
    session = requests.Session()
    session.cookies.update({'sessionid': '', 'mid': '', 'ig_pr': '1',
                            'ig_vw': '1920', 'csrftoken': '',
                            's_network': '', 'ds_user_id': ''})
    session.headers.update(default_http_header(empty_session_only=True))
    return session
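
# Anonymous sessions suffice for public data, e.g. (placeholder profile):
#     data = get_json('instagram', get_anonymous_session())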


def get_session(user: str, passwd: str) -> requests.Session:
    """Log in to instagram with given username and password and return session object"""
    session = requests.Session()
    session.cookies.update({'sessionid': '', 'mid': '', 'ig_pr': '1',
                            'ig_vw': '1920', 'csrftoken': '',
                            's_network': '', 'ds_user_id': ''})
    session.headers.update(default_http_header())
    resp = session.get('https://www.instagram.com/')
    session.headers.update({'X-CSRFToken': resp.cookies['csrftoken']})
    time.sleep(9 * random.random() + 3)
    login = session.post('https://www.instagram.com/accounts/login/ajax/',
                         data={'password': passwd, 'username': user}, allow_redirects=True)
    session.headers.update({'X-CSRFToken': login.cookies['csrftoken']})
    time.sleep(5 * random.random())
    if login.status_code == 200:
        if user == test_login(session):
            return session
        else:
            raise BadCredentialsException('Login error! Check your credentials!')
    else:
        raise ConnectionException('Login error! Connection error!')


def get_feed_json(session: requests.Session, end_cursor: Optional[str] = None, sleep: bool = True) -> Dict[str, Any]:
    """
    Get JSON of the user's feed.

    :param session: Session belonging to a user, i.e. not an anonymous session
    :param end_cursor: The end cursor, as from json["feed"]["media"]["page_info"]["end_cursor"]
    :param sleep: Sleep between requests to instagram server
    :return: JSON
    """
    if end_cursor is None:
        return get_json(str(), session, sleep=sleep)["entry_data"]["FeedPage"][0]
    tmpsession = copy_session(session)
    query = "q=ig_me()+%7B%0A++feed+%7B%0A++++media.after(" + end_cursor + "%2C+12)+%7B%0A" + \
            "++++++nodes+%7B%0A++++++++id%2C%0A++++++++caption%2C%0A++++++++code%2C%0A++++++++" + \
            "comments.last(4)+%7B%0A++++++++++count%2C%0A++++++++++nodes+%7B%0A++++++++++++" + \
            "id%2C%0A++++++++++++created_at%2C%0A++++++++++++text%2C%0A++++++++++++" + \
            "user+%7B%0A++++++++++++++id%2C%0A++++++++++++++profile_pic_url%2C%0A++++++++++++++" + \
            "username%0A++++++++++++%7D%0A++++++++++%7D%2C%0A++++++++++" + \
            "page_info%0A++++++++%7D%2C%0A++++++++comments_disabled%2C%0A++++++++" + \
            "date%2C%0A++++++++dimensions+%7B%0A++++++++++height%2C%0A++++++++++" + \
            "width%0A++++++++%7D%2C%0A++++++++display_src%2C%0A++++++++is_video%2C%0A++++++++" + \
            "likes+%7B%0A++++++++++count%2C%0A++++++++++nodes+%7B%0A++++++++++++" + \
            "user+%7B%0A++++++++++++++id%2C%0A++++++++++++++profile_pic_url%2C%0A++++++++++++++" + \
            "username%0A++++++++++++%7D%0A++++++++++%7D%2C%0A++++++++++" + \
            "viewer_has_liked%0A++++++++%7D%2C%0A++++++++location+%7B%0A++++++++++" + \
            "id%2C%0A++++++++++has_public_page%2C%0A++++++++++name%0A++++++++%7D%2C%0A++++++++" + \
            "owner+%7B%0A++++++++++id%2C%0A++++++++++blocked_by_viewer%2C%0A++++++++++" + \
            "followed_by_viewer%2C%0A++++++++++full_name%2C%0A++++++++++" + \
            "has_blocked_viewer%2C%0A++++++++++is_private%2C%0A++++++++++" + \
            "profile_pic_url%2C%0A++++++++++requested_by_viewer%2C%0A++++++++++" + \
            "username%0A++++++++%7D%2C%0A++++++++usertags+%7B%0A++++++++++" + \
            "nodes+%7B%0A++++++++++++user+%7B%0A++++++++++++++" + \
            "username%0A++++++++++++%7D%2C%0A++++++++++++x%2C%0A++++++++++++y%0A++++++++++" + \
            "%7D%0A++++++++%7D%2C%0A++++++++video_url%2C%0A++++++++" + \
            "video_views%0A++++++%7D%2C%0A++++++page_info%0A++++%7D%0A++%7D%2C%0A++id%2C%0A++" + \
            "profile_pic_url%2C%0A++username%0A%7D%0A&ref=feed::show"
    tmpsession.headers.update(default_http_header())
    tmpsession.headers.update({'Referer': 'https://www.instagram.com/'})
    tmpsession.headers.update({'Content-Type': 'application/x-www-form-urlencoded'})
    resp = tmpsession.post('https://www.instagram.com/query/', data=query)
    if sleep:
        time.sleep(4 * random.random() + 1)
    return json.loads(resp.text)


def get_location(session: requests.Session, node_code: str, sleep: bool = True) -> Dict[str, str]:
    """Return location metadata of a post, if it has any."""
    pic_json = get_json("p/" + node_code, session, sleep=sleep)
    media = pic_json["entry_data"]["PostPage"][0]["graphql"]["shortcode_media"] \
        if "graphql" in pic_json["entry_data"]["PostPage"][0] \
        else pic_json["entry_data"]["PostPage"][0]["media"]
    if media["location"] is not None:
        location_json = get_json("explore/locations/" +
                                 media["location"]["id"],
                                 session, sleep=sleep)
        return location_json["entry_data"]["LocationsPage"][0]["location"]


def download_node(node: Dict[str, Any], session: requests.Session, name: str,
                  download_videos: bool = True, geotags: bool = False,
                  sleep: bool = True, shorter_output: bool = False, quiet: bool = False) -> bool:
    """
    Download everything associated with one instagram node, i.e. picture, caption and video.

    :param node: Node, as from media->nodes list in instagram's JSONs
    :param session: Session
    :param name: Name of profile to which this node belongs
    :param download_videos: True, if videos should be downloaded
    :param geotags: Download geotags
    :param sleep: Sleep between requests to instagram server
    :param shorter_output: Shorten log output by not printing captions
    :param quiet: Suppress output
    :return: True if something was downloaded, False otherwise, i.e. file was already there
    """
    # pylint:disable=too-many-branches,too-many-locals
    date = node["date"] if "date" in node else node["taken_at_timestamp"]
    if '__typename' in node:
        if node['__typename'] == 'GraphSidecar':
            sidecar_data = session.get('https://www.instagram.com/p/' + node['code'] + '/', params={'__a': 1}).json()
            edge_number = 1
            downloaded = True
            media = sidecar_data["graphql"]["shortcode_media"] if "graphql" in sidecar_data else sidecar_data["media"]
            for edge in media['edge_sidecar_to_children']['edges']:
                edge_downloaded = download_pic(name, edge['node']['display_url'], date,
                                               filename_suffix=str(edge_number), quiet=quiet,
                                               outputlabel=(str(edge_number) if edge_number != 1 else None))
                downloaded = downloaded and edge_downloaded
                edge_number += 1
                if sleep:
                    time.sleep(1.75 * random.random() + 0.25)
        elif node['__typename'] in ['GraphImage', 'GraphVideo']:
            downloaded = download_pic(name, node["display_url"] if "display_url" in node else node["display_src"],
                                      date, quiet=quiet)
            if sleep:
                time.sleep(1.75 * random.random() + 0.25)
        else:
            _log("Warning: Unknown typename discovered: " + node['__typename'])
            downloaded = False
    else:
        # Node is an old image or video.
        downloaded = download_pic(name, node["display_src"], date, quiet=quiet)
        if sleep:
            time.sleep(1.75 * random.random() + 0.25)
    if "edge_media_to_caption" in node and node["edge_media_to_caption"]["edges"]:
        save_caption(name, date, node["edge_media_to_caption"]["edges"][0]["node"]["text"], shorter_output, quiet)
    elif "caption" in node:
        save_caption(name, date, node["caption"], shorter_output, quiet)
    else:
        _log("<no caption>", end=' ', flush=True, quiet=quiet)
    node_code = node['shortcode'] if 'shortcode' in node else node['code']
    if node["is_video"] and download_videos:
        video_data = get_json('p/' + node_code, session, sleep=sleep)
        download_pic(name,
                     video_data['entry_data']['PostPage'][0]['graphql']['shortcode_media']['video_url'],
                     date, 'mp4', quiet=quiet)
    if geotags:
        location = get_location(session, node_code, sleep)
        if location:
            save_location(name, location, date, quiet=quiet)
    _log(quiet=quiet)
    return downloaded


def download_feed_pics(session: requests.Session, max_count: Optional[int] = None, fast_update: bool = False,
                       filter_func: Optional[Callable[[Dict[str, Dict[str, Any]]], bool]] = None,
                       download_videos: bool = True, geotags: bool = False,
                       shorter_output: bool = False, sleep: bool = True, quiet: bool = False) -> None:
    """
    Download pictures from the user's feed.

    Example to download up to the 20 pics the user last liked:
    >>> download_feed_pics(load_session('USER'), max_count=20, fast_update=True,
    >>>                    filter_func=lambda node:
    >>>                    not node["likes"]["viewer_has_liked"] if "likes" in node else not node["viewer_has_liked"])

    :param session: Session belonging to a user, i.e. not an anonymous session
    :param max_count: Maximum count of pictures to download
    :param fast_update: If true, abort when first already-downloaded picture is encountered
    :param filter_func: function(node), which returns True if given picture should not be downloaded
    :param download_videos: True, if videos should be downloaded
    :param geotags: Download geotags
    :param shorter_output: Shorten log output by not printing captions
    :param sleep: Sleep between requests to instagram server
    :param quiet: Suppress output
    """
    # pylint:disable=too-many-locals
    data = get_feed_json(session, sleep=sleep)
    count = 1
    while True:
        if "graphql" in data:
            is_edge = True
            feed = data["graphql"]["user"]["edge_web_feed_timeline"]
        else:
            is_edge = False
            feed = data["feed"]["media"]
        for edge_or_node in feed["edges"] if is_edge else feed["nodes"]:
            if max_count is not None and count > max_count:
                return
            node = edge_or_node["node"] if is_edge else edge_or_node
            name = node["owner"]["username"]
            if filter_func is not None and filter_func(node):
                _log("<pic by %s skipped>" % name, flush=True, quiet=quiet)
                continue
            _log("[%3i] %s " % (count, name), end="", flush=True, quiet=quiet)
            count += 1
            downloaded = download_node(node, session, name,
                                       download_videos=download_videos, geotags=geotags,
                                       sleep=sleep, shorter_output=shorter_output, quiet=quiet)
            if fast_update and not downloaded:
                return
        if not feed["page_info"]["has_next_page"]:
            break
        data = get_feed_json(session, end_cursor=feed["page_info"]["end_cursor"], sleep=sleep)


def get_hashtag_json(hashtag: str, session: requests.Session,
                     max_id: Optional[str] = None, sleep: bool = True) -> Optional[Dict[str, Any]]:
    """Return JSON of a #hashtag"""
    return get_json(name='explore/tags/{0}/'.format(hashtag), session=session, max_id=max_id, sleep=sleep)


def download_hashtag(hashtag: str, session: requests.Session,
                     max_count: Optional[int] = None,
                     filter_func: Optional[Callable[[Dict[str, Dict[str, Any]]], bool]] = None,
                     fast_update: bool = False, download_videos: bool = True, geotags: bool = False,
                     shorter_output: bool = False, sleep: bool = True, quiet: bool = False) -> None:
    """Download pictures of one hashtag.

    To download the last 30 pictures with hashtag #cat, do
    >>> download_hashtag('cat', session=get_anonymous_session(), max_count=30)

    :param hashtag: Hashtag to download, without leading '#'
    :param session: Session; an anonymous session suffices, as in the example above
    :param max_count: Maximum count of pictures to download
    :param filter_func: function(node), which returns True if given picture should not be downloaded
    :param fast_update: If true, abort when first already-downloaded picture is encountered
    :param download_videos: True, if videos should be downloaded
    :param geotags: Download geotags
    :param shorter_output: Shorten log output by not printing captions
    :param sleep: Sleep between requests to instagram server
    :param quiet: Suppress output
    """
    data = get_hashtag_json(hashtag, session, sleep=sleep)
    count = 1
    while data:
        for node in data['entry_data']['TagPage'][0]['tag']['media']['nodes']:
            if max_count is not None and count > max_count:
                return
            _log('[{0:3d}] #{1} '.format(count, hashtag), end='', flush=True, quiet=quiet)
            if filter_func is not None and filter_func(node):
                _log('<skipped>', quiet=quiet)
                continue
            count += 1
            downloaded = download_node(node, session, '#{0}'.format(hashtag),
                                       download_videos=download_videos, geotags=geotags, sleep=sleep,
                                       shorter_output=shorter_output, quiet=quiet)
            if fast_update and not downloaded:
                return
        if data['entry_data']['TagPage'][0]['tag']['media']['page_info']['has_next_page']:
            data = get_hashtag_json(hashtag, session, sleep=sleep,
                                    max_id=data['entry_data']['TagPage'][0]['tag']['media']['page_info']['end_cursor'])
        else:
            break


def check_id(profile: str, session: requests.Session, json_data: Dict[str, Any], quiet: bool = False) -> str:
    """
    Consult locally stored ID of profile with given name, check whether ID matches and whether name
    has changed, and return current name of the profile, and store ID of profile.
    """
    profile_exists = len(json_data["entry_data"]) > 0 and "ProfilePage" in json_data["entry_data"]
    is_logged_in = json_data["config"]["viewer"] is not None
    try:
        with open(profile + "/id", 'rb') as id_file:
            profile_id = int(id_file.read())
        if (not profile_exists) or \
                (profile_id != int(json_data['entry_data']['ProfilePage'][0]['user']['id'])):
            if is_logged_in:
                newname = get_username_by_id(session, profile_id)
                _log("Profile {0} has changed its name to {1}.".format(profile, newname),
                     quiet=quiet)
                os.rename(profile, newname)
                return newname
            if profile_exists:
                raise ProfileNotExistsException("Profile {0} does not match the stored "
                                                "unique ID {1}.".format(profile, profile_id))
            raise ProfileNotExistsException("Profile {0} does not exist. Please login to "
                                            "update profile name. Unique ID: {1}."
                                            .format(profile, profile_id))
        return profile
    except FileNotFoundError:
        pass
    if profile_exists:
        os.makedirs(profile.lower(), exist_ok=True)
        with open(profile + "/id", 'w') as text_file:
            profile_id = json_data['entry_data']['ProfilePage'][0]['user']['id']
            text_file.write(profile_id + "\n")
        _log("Stored ID {0} for profile {1}.".format(profile_id, profile), quiet=quiet)
        return profile
    raise ProfileNotExistsException("Profile {0} does not exist.".format(profile))


def download(name: str, session: requests.Session,
             profile_pic_only: bool = False, download_videos: bool = True, geotags: bool = False,
             fast_update: bool = False, shorter_output: bool = False, sleep: bool = True,
             quiet: bool = False) -> None:
    """Download one profile"""
    # pylint:disable=too-many-branches,too-many-locals
    # Get profile main page json
    data = get_json(name, session, sleep=sleep)
    # check if profile does exist or name has changed since last download
    # and update name and json data if necessary
    name_updated = check_id(name, session, data, quiet=quiet)
    if name_updated != name:
        name = name_updated
        data = get_json(name, session, sleep=sleep)
    # Download profile picture
    download_profilepic(name, data["entry_data"]["ProfilePage"][0]["user"]["profile_pic_url"],
                        quiet=quiet)
    if sleep:
        time.sleep(1.75 * random.random() + 0.25)
    if profile_pic_only:
        return
    # Catch some errors
    if data["entry_data"]["ProfilePage"][0]["user"]["is_private"]:
        if data["config"]["viewer"] is None:
            raise LoginRequiredException("profile %s requires login" % name)
        if not data["entry_data"]["ProfilePage"][0]["user"]["followed_by_viewer"]:
            raise PrivateProfileNotFollowedException("Profile %s: private but not followed." % name)
    else:
        if data["config"]["viewer"] is not None:
            _log("profile %s could also be downloaded anonymously." % name, quiet=quiet)
    if ("nodes" not in data["entry_data"]["ProfilePage"][0]["user"]["media"] or
            not data["entry_data"]["ProfilePage"][0]["user"]["media"]["nodes"]) \
            and not profile_pic_only:
        raise ProfileHasNoPicsException("Profile %s: no pics found." % name)

    # Iterate over pictures and download them
    def get_last_id(data):
        if data["entry_data"] and data["entry_data"]["ProfilePage"][0]["user"]["media"]["nodes"]:
            return data["entry_data"]["ProfilePage"][0]["user"]["media"]["nodes"][-1]["id"]

    totalcount = data["entry_data"]["ProfilePage"][0]["user"]["media"]["count"]
    count = 1
    while get_last_id(data) is not None:
        for node in data["entry_data"]["ProfilePage"][0]["user"]["media"]["nodes"]:
            _log("[%3i/%3i] " % (count, totalcount), end="", flush=True, quiet=quiet)
            count += 1
            downloaded = download_node(node, session, name,
                                       download_videos=download_videos, geotags=geotags,
                                       sleep=sleep, shorter_output=shorter_output, quiet=quiet)
            if fast_update and not downloaded:
                return
        data = get_json(name, session, max_id=get_last_id(data), sleep=sleep)


def get_logged_in_session(username: str, password: Optional[str] = None, quiet: bool = False) -> requests.Session:
    """Logs in and returns session, asking user for password if needed"""
    if password is not None:
        return get_session(username, password)
    if quiet:
        raise LoginRequiredException("Quiet mode requires given password or valid "
                                     "session file.")
    while password is None:
        password = getpass.getpass(prompt="Enter Instagram password for %s: " % username)
        try:
            return get_session(username, password)
        except BadCredentialsException as err:
            print(err, file=sys.stderr)
            password = None


def download_profiles(profilelist: List[str], username: Optional[str] = None, password: Optional[str] = None,
                      sessionfile: Optional[str] = None, max_count: Optional[int] = None,
                      profile_pic_only: bool = False, download_videos: bool = True, geotags: bool = False,
                      fast_update: bool = False,
                      sleep: bool = True, shorter_output: bool = False, quiet: bool = False) -> None:
    """Download set of profiles and handle sessions"""
    # pylint:disable=too-many-branches,too-many-locals
    # Login, if desired
    if username is not None:
        session = load_session(username, sessionfile, quiet=quiet)
        if username != test_login(session):
            session = get_logged_in_session(username, password, quiet)
        _log("Logged in as %s." % username, quiet=quiet)
    else:
        session = get_anonymous_session()
    # Try block for KeyboardInterrupt (save session on ^C)
    failedtargets = []
    targets = set()
    try:
        # Generate set of targets
        for pentry in profilelist:
            if pentry[0] == '#':
                _log("Retrieving pictures with hashtag {0}".format(pentry), quiet=quiet)
                download_hashtag(hashtag=pentry[1:], session=session, max_count=max_count, fast_update=fast_update,
                                 download_videos=download_videos, geotags=geotags, shorter_output=shorter_output,
                                 sleep=sleep, quiet=quiet)
            elif pentry[0] == '@' and username is not None:
                _log("Retrieving followees of %s..." % pentry[1:], quiet=quiet)
                followees = get_followees(pentry[1:], session)
                targets.update([followee['username'] for followee in followees])
            elif pentry == ":feed-all" and username is not None:
                _log("Retrieving pictures from your feed...", quiet=quiet)
                download_feed_pics(session, fast_update=fast_update, max_count=max_count,
                                   download_videos=download_videos, geotags=geotags,
                                   shorter_output=shorter_output, sleep=sleep, quiet=quiet)
            elif pentry == ":feed-liked" and username is not None:
                _log("Retrieving pictures you liked from your feed...", quiet=quiet)
                download_feed_pics(session, fast_update=fast_update, max_count=max_count,
                                   filter_func=lambda node:
                                   not node["likes"]["viewer_has_liked"]
                                   if "likes" in node
                                   else not node["viewer_has_liked"],
                                   download_videos=download_videos, geotags=geotags,
                                   shorter_output=shorter_output, sleep=sleep, quiet=quiet)
            else:
                targets.add(pentry)
        if len(targets) > 1:
            _log("Downloading %i profiles..." % len(targets), quiet=quiet)
        # Iterate through targets list and download them
        for target in targets:
            try:
                try:
                    download(target, session, profile_pic_only, download_videos,
                             geotags, fast_update, shorter_output, sleep, quiet)
                except ProfileNotExistsException as err:
                    if username is not None:
                        _log("\"Profile not exists\" - Trying again anonymously; this helps in case "
                             "you are just blocked.")
                        download(target, get_anonymous_session(), profile_pic_only, download_videos,
                                 geotags, fast_update, shorter_output, sleep, quiet)
                    else:
                        raise err
            except NonfatalException as err:
                failedtargets.append(target)
                print(err, file=sys.stderr)
    except KeyboardInterrupt:
        print("\nInterrupted by user.", file=sys.stderr)
    if len(targets) > 1 and failedtargets:
        print("Errors occurred (see above) while downloading profiles: %s." %
              ", ".join(failedtargets), file=sys.stderr)
    # Save session if it is useful
    if username is not None:
        save_session(session, username, sessionfile, quiet=quiet)


def main():
    parser = ArgumentParser(description=__doc__,
                            epilog="Report issues at https://github.com/Thammus/instaloader/issues.")
    parser.add_argument('profile', nargs='*', metavar='profile|#hashtag',
                        help='Name of profile or #hashtag to download. '
                             'Alternatively, if --login is given: @<profile> to download all followees of '
                             '<profile>; or the special targets :feed-all or :feed-liked to '
                             'download pictures from your feed (using '
                             '--fast-update is recommended).')
    parser.add_argument('--version', action='version',
                        version=__version__)
    parser.add_argument('-l', '--login', metavar='YOUR-USERNAME',
                        help='Login name for your Instagram account. Not needed to download public '
                             'profiles, but if you want to download private profiles or all followees of '
                             'some profile, you have to specify a username used to login.')
    parser.add_argument('-p', '--password', metavar='YOUR-PASSWORD',
                        help='Password for your Instagram account. If --login is given and there is '
                             'not yet a valid session file, you\'ll be prompted for your password if '
                             '--password is not given. Specifying this option without --login has no '
                             'effect.')
    parser.add_argument('-f', '--sessionfile',
                        help='File to store session key, defaults to ' +
                             get_default_session_filename("<login_name>"))
    parser.add_argument('-P', '--profile-pic-only', action='store_true',
                        help='Only download profile picture')
    parser.add_argument('-V', '--skip-videos', action='store_true',
                        help='Do not download videos')
    parser.add_argument('-G', '--geotags', action='store_true',
                        help='Store geotags when available')
    parser.add_argument('-F', '--fast-update', action='store_true',
                        help='Abort at encounter of first already-downloaded picture')
    parser.add_argument('-c', '--count',
                        help='Do not attempt to download more than COUNT posts. '
                             'Applies only to #hashtag, :feed-all and :feed-liked.')
    parser.add_argument('-S', '--no-sleep', action='store_true',
                        help='Do not sleep between actual downloads of pictures')
    parser.add_argument('-O', '--shorter-output', action='store_true',
                        help='Do not display captions while downloading')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='Disable user interaction, i.e. do not print messages (except errors) and fail '
                             'if login credentials are needed but not given.')
    args = parser.parse_args()
    try:
        download_profiles(args.profile, args.login, args.password, args.sessionfile,
                          int(args.count) if args.count is not None else None,
                          args.profile_pic_only, not args.skip_videos, args.geotags, args.fast_update,
                          not args.no_sleep, args.shorter_output, args.quiet)
    except InstaloaderException as err:
        raise SystemExit("Fatal error: %s" % err)


if __name__ == "__main__":
    main()