2016-06-15 12:42:08 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
2017-07-20 14:54:22 +02:00
|
|
|
"""Download pictures (or videos) along with their captions and other metadata from Instagram."""
|
2017-08-19 16:14:18 +02:00
|
|
|
import ast
|
2017-06-24 22:43:40 +02:00
|
|
|
import getpass
|
|
|
|
import json
|
|
|
|
import os
|
|
|
|
import pickle
|
2017-08-24 18:30:46 +02:00
|
|
|
import random
|
2017-06-24 22:43:40 +02:00
|
|
|
import re
|
|
|
|
import shutil
|
2017-07-25 18:31:08 +02:00
|
|
|
import string
|
2017-06-24 22:43:40 +02:00
|
|
|
import sys
|
|
|
|
import tempfile
|
2017-08-20 11:48:19 +02:00
|
|
|
import textwrap
|
2017-06-24 22:43:40 +02:00
|
|
|
import time
|
2017-08-19 22:44:08 +02:00
|
|
|
import urllib.parse
|
2017-08-20 11:28:12 +02:00
|
|
|
from argparse import ArgumentParser, SUPPRESS
|
2017-07-06 22:26:25 +02:00
|
|
|
from base64 import b64decode, b64encode
|
2017-08-06 19:27:46 +02:00
|
|
|
from contextlib import contextmanager, suppress
|
2017-07-29 11:08:52 +02:00
|
|
|
from datetime import datetime
|
2017-08-11 19:51:00 +02:00
|
|
|
from enum import Enum
|
|
|
|
|
2016-08-01 18:10:35 +02:00
|
|
|
from io import BytesIO
|
2017-08-06 19:27:46 +02:00
|
|
|
from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
|
2017-06-24 22:43:40 +02:00
|
|
|
|
|
|
|
import requests
|
|
|
|
import requests.utils
|
2017-08-01 16:30:59 +02:00
|
|
|
import urllib3
|
2016-12-22 13:20:41 +01:00
|
|
|
|
2016-08-01 18:10:35 +02:00
|
|
|
|
2017-12-11 21:21:07 +01:00
|
|
|
__version__ = '3.2.1'
|
2017-08-11 17:50:37 +02:00
|
|
|
|
2016-09-18 14:38:58 +02:00
|
|
|
|
2016-08-01 18:10:35 +02:00
|
|
|
try:
|
2016-09-18 14:38:58 +02:00
|
|
|
# pylint:disable=wrong-import-position
|
2016-08-01 18:10:35 +02:00
|
|
|
import win_unicode_console
|
|
|
|
except ImportError:
|
|
|
|
WINUNICODE = False
|
|
|
|
else:
|
|
|
|
win_unicode_console.enable()
|
|
|
|
WINUNICODE = True
|
2016-06-15 12:42:08 +02:00
|
|
|
|
2016-08-18 09:58:07 +02:00
|
|
|
|
2016-07-26 10:57:29 +02:00
|
|
|
class InstaloaderException(Exception):
    """Base exception for this script.

    :note: This exception should not be raised directly."""


class QueryReturnedNotFoundException(InstaloaderException):
    """Raised when the server answers a request with HTTP status 404."""


class ProfileNotExistsException(InstaloaderException):
    """Presumably signals that a requested profile does not exist (raise site not in this part of the file)."""


class ProfileHasNoPicsException(InstaloaderException):
    """Presumably signals that a profile has no pictures (raise site not in this part of the file)."""


class PrivateProfileNotFollowedException(InstaloaderException):
    """Presumably signals a private profile that is not followed (raise site not in this part of the file)."""


class LoginRequiredException(InstaloaderException):
    """Presumably signals an operation that needs a login (raise site not in this part of the file)."""


class InvalidArgumentException(InstaloaderException):
    """Raised for invalid user-supplied values, e.g. a malformed shortcode, mediaid or filter expression."""


class BadResponseException(InstaloaderException):
    """Presumably signals an unexpected server response (raise site not in this part of the file)."""


class BadCredentialsException(InstaloaderException):
    """Presumably signals rejected login credentials (raise site not in this part of the file)."""


class ConnectionException(InstaloaderException):
    """Raised when a request fails, e.g. on an unexpected HTTP status code or a non-"ok" JSON status."""


class TooManyRequests(ConnectionException):
    """Raised when the server answers a request with HTTP status 429."""
|
|
|
|
|
|
|
|
|
2016-12-22 13:20:41 +01:00
|
|
|
def get_default_session_filename(username: str) -> str:
    """Build the default session file path for the given username.

    The path has the form ``<tempdir>/.instaloader-<os user>/session-<username>``. Its parts are joined with
    forward slashes and the complete path is returned in lowercase.

    :param username: Instagram user name the session belongs to.
    :return: Lowercased path of the session file.
    """
    path_parts = (
        tempfile.gettempdir(),
        ".instaloader-" + getpass.getuser(),
        "session-" + username,
    )
    return "/".join(path_parts).lower()
|
2016-07-26 17:36:21 +02:00
|
|
|
|
2016-09-18 16:35:25 +02:00
|
|
|
|
2016-12-22 13:20:41 +01:00
|
|
|
def copy_session(session: requests.Session) -> requests.Session:
    """Create a fresh requests.Session that carries copies of the given session's cookies and headers.

    :param session: Session to duplicate.
    :return: New session; its cookie jar is rebuilt from a dict snapshot and its headers are a copy.
    """
    cookie_snapshot = requests.utils.dict_from_cookiejar(session.cookies)
    duplicate = requests.Session()
    duplicate.cookies = requests.utils.cookiejar_from_dict(cookie_snapshot)
    duplicate.headers = session.headers.copy()
    return duplicate
|
|
|
|
|
2016-09-18 16:35:25 +02:00
|
|
|
|
2017-07-20 11:25:46 +02:00
|
|
|
def default_user_agent() -> str:
    """Return the fixed, browser-like User-Agent string of this script."""
    platform = 'Mozilla/5.0 (X11; Linux x86_64)'
    engine = 'AppleWebKit/537.36 (KHTML, like Gecko)'
    browser = 'Chrome/51.0.2704.79 Safari/537.36'
    return ' '.join((platform, engine, browser))
|
2016-07-25 23:43:41 +02:00
|
|
|
|
2016-09-18 16:35:25 +02:00
|
|
|
|
2017-07-06 22:26:25 +02:00
|
|
|
def shortcode_to_mediaid(code: str) -> int:
    """Convert a media shortcode into the numeric mediaid it encodes.

    The shortcode is interpreted as a big-endian number written with Base64 digits, using ``-`` and ``_``
    in place of ``+`` and ``/``.

    :param code: Shortcode of at most 11 characters.
    :return: The mediaid as integer.
    :raises InvalidArgumentException: If the shortcode is too long or is not made of valid Base64 digits.
    """
    import binascii  # local import: only needed to recognise decoding errors

    if len(code) > 11:
        raise InvalidArgumentException("Wrong shortcode \"{0}\", unable to convert to mediaid.".format(code))
    # Left-pad with 'A' (the Base64 digit for zero) to 12 digits, which decode to exactly 9 bytes.
    padded = 'A' * (12 - len(code)) + code
    try:
        # validate=True: by default b64decode silently discards characters outside the alphabet, so a
        # shortcode containing garbage could decode to an unrelated mediaid instead of being rejected.
        raw = b64decode(padded.encode(), b'-_', validate=True)
    except binascii.Error:
        raise InvalidArgumentException(
            "Wrong shortcode \"{0}\", unable to convert to mediaid.".format(code)) from None
    return int.from_bytes(raw, 'big')
|
|
|
|
|
|
|
|
|
|
|
|
def mediaid_to_shortcode(mediaid: int) -> str:
    """Convert a numeric mediaid into its shortcode.

    The mediaid is written as 9 big-endian bytes, encoded with Base64 digits (``-`` and ``_`` in place of
    ``+`` and ``/``), and the leading ``A`` digits, which stand for zero, are removed.

    :param mediaid: Mediaid of at most 64 bits.
    :return: The shortcode; the empty string for mediaid 0.
    :raises InvalidArgumentException: If the mediaid needs more than 64 bits.
    """
    if mediaid.bit_length() > 64:
        raise InvalidArgumentException("Wrong mediaid {0}, unable to convert to shortcode".format(str(mediaid)))
    digits = b64encode(mediaid.to_bytes(9, 'big'), b'-_').decode()
    # The encoded text never contains whitespace, so stripping leading 'A's directly is sufficient.
    return digits.lstrip('A')
|
|
|
|
|
|
|
|
|
2017-07-25 18:31:08 +02:00
|
|
|
def format_string_contains_key(format_string: str, key: str) -> bool:
    """Tell whether a format string has a replacement field for ``key``.

    A field matches if its name is exactly ``key`` or starts with ``key`` followed by a dot (attribute
    access such as ``{key.attr}``).

    :param format_string: Format string in :meth:`str.format` syntax.
    :param key: Field name to look for.
    :return: True if such a field occurs, False otherwise.
    """
    attribute_prefix = key + '.'
    for _literal_text, field_name, _format_spec, _conversion in string.Formatter().parse(format_string):
        # parse() reports field_name=None for literal text that is not followed by a replacement field
        # (e.g. the tail of "{profile}/x"); such chunks cannot contain the key.
        if field_name is None:
            continue
        if field_name == key or field_name.startswith(attribute_prefix):
            return True
    return False
|
|
|
|
|
|
|
|
|
2017-08-19 16:14:18 +02:00
|
|
|
def filterstr_to_filterfunc(filter_str: str, logged_in: bool) -> Callable[['Post'], bool]:
    """Turn an --only-if=... filter specification into a callable deciding whether a Post matches.

    :param filter_str: Filter expression; bare names in it refer to attributes of :class:`Post`.
    :param logged_in: Whether names listed in ``Post.LOGIN_REQUIRING_PROPERTIES`` may be used.
    :return: Function mapping a Post to the truth value of the filter expression.
    :raises InvalidArgumentException: If the expression stores to a name, uses a name that is not an
       attribute of Post, or uses a login-requiring name while not logged in.
    """

    # Strategy: parse the expression, rewrite every bare name <name> in its AST into the attribute load
    # post.<name>, compile the result once, and evaluate it per post with 'post' bound in the namespace.

    class NameToPostAttribute(ast.NodeTransformer):
        def visit_Name(self, node: ast.Name):
            # pylint:disable=invalid-name,no-self-use
            name = node.id
            if not isinstance(node.ctx, ast.Load):
                raise InvalidArgumentException("Invalid filter: Modifying variables ({}) not allowed.".format(name))
            if not hasattr(Post, name):
                raise InvalidArgumentException("Invalid filter: Name {} is not defined.".format(name))
            if not logged_in and name in Post.LOGIN_REQUIRING_PROPERTIES:
                raise InvalidArgumentException("Invalid filter: Name {} requires being logged in.".format(name))
            post_ref = ast.copy_location(ast.Name('post', ast.Load()), node)
            load_ctx = ast.copy_location(ast.Load(), node)
            attribute = ast.Attribute(post_ref, name, load_ctx)
            return ast.copy_location(attribute, node)

    source_name = '<--only-if parameter>'
    parsed = ast.parse(filter_str, filename=source_name, mode='eval')
    rewritten = NameToPostAttribute().visit(parsed)
    filter_code = compile(rewritten, filename=source_name, mode='eval')

    def filterfunc(post: 'Post') -> bool:
        # pylint:disable=eval-used
        namespace = {'post': post}
        return bool(eval(filter_code, namespace))

    return filterfunc
|
|
|
|
|
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
class Post:
    """
    Structure containing information about an Instagram post.

    Created by Instaloader methods :meth:`.get_profile_posts`, :meth:`.get_hashtag_posts`, :meth:`.get_feed_posts`.
    Posts are linked to an :class:`Instaloader` instance which is used for error logging and obtaining of additional
    metadata, if required. This class unifies access to the properties associated with a post. It implements == and is
    hashable.

    The properties defined here are accessible by the filter expressions specified with the :option:`--only-if`
    parameter and exported into JSON files with :option:`--metadata-json`.
    """

    LOGIN_REQUIRING_PROPERTIES = ["viewer_has_liked"]

    def __init__(self, instaloader: 'Instaloader', node: Dict[str, Any],
                 profile: Optional[str] = None, profile_id: Optional[int] = None):
        """Create a Post instance from a node structure as returned by Instagram.

        :param instaloader: :class:`Instaloader` instance used for additional queries if necessary.
        :param node: Node structure.
        :param profile: The name of the owner, if already known at creation.
        :param profile_id: The ID of the owner, if already known at creation.
        """
        self._instaloader = instaloader
        self._node = node
        self._profile = profile
        self._profile_id = profile_id
        # Filled lazily by the _full_metadata property on first access.
        self._full_metadata_dict = None

    @classmethod
    def from_shortcode(cls, instaloader: 'Instaloader', shortcode: str):
        """Create a post object from a given shortcode"""
        # pylint:disable=protected-access
        post = cls(instaloader, {'shortcode': shortcode})
        # Replace the stub node by the full metadata, which is fetched right here.
        post._node = post._full_metadata
        return post

    @classmethod
    def from_mediaid(cls, instaloader: 'Instaloader', mediaid: int):
        """Create a post object from a given mediaid"""
        return cls.from_shortcode(instaloader, mediaid_to_shortcode(mediaid))

    @property
    def shortcode(self) -> str:
        """Media shortcode. URL of the post is instagram.com/p/<shortcode>/."""
        return self._node['shortcode'] if 'shortcode' in self._node else self._node['code']

    @property
    def mediaid(self) -> int:
        """The mediaid is a decimal representation of the media shortcode."""
        return int(self._node['id'])

    def __repr__(self):
        return '<Post {}>'.format(self.shortcode)

    def __eq__(self, o: object) -> bool:
        # Posts are identified by their shortcode alone.
        if isinstance(o, Post):
            return self.shortcode == o.shortcode
        return NotImplemented

    def __hash__(self) -> int:
        return hash(self.shortcode)

    @property
    def _full_metadata(self) -> Dict[str, Any]:
        """Full metadata of the post, requested from the post's page on first access and cached afterwards."""
        if not self._full_metadata_dict:
            pic_json = self._instaloader.get_json("p/{0}/".format(self.shortcode), params={'__a': 1})
            # Two response layouts are supported: nested below "graphql" or directly below "media".
            if "graphql" in pic_json:
                self._full_metadata_dict = pic_json["graphql"]["shortcode_media"]
            else:
                self._full_metadata_dict = pic_json["media"]
        return self._full_metadata_dict

    def _field(self, *keys) -> Any:
        """Lookups given fields in _node, and if not found in _full_metadata. Raises KeyError if not found anywhere."""
        # pylint:disable=invalid-name
        try:
            d = self._node
            for key in keys:
                d = d[key]
            return d
        except KeyError:
            # Not in the node we were created from; fall back to the (possibly freshly requested) full metadata.
            d = self._full_metadata
            for key in keys:
                d = d[key]
            return d

    @property
    def owner_username(self) -> str:
        """The Post's lowercase owner name, or 'UNKNOWN'."""
        try:
            if self._profile:
                return self._profile.lower()
            return self._field('owner', 'username').lower()
        except (InstaloaderException, KeyError, TypeError) as err:
            self._instaloader.error("Get owner name of {}: {} -- using \'UNKNOWN\'.".format(self, err))
            return 'UNKNOWN'

    @property
    def owner_id(self) -> int:
        """The ID of the Post's owner."""
        if self._profile_id:
            return self._profile_id
        return int(self._field('owner', 'id'))

    @property
    def date(self) -> datetime:
        """Timestamp when the post was created."""
        return datetime.fromtimestamp(self._node["date"] if "date" in self._node else self._node["taken_at_timestamp"])

    @property
    def url(self) -> str:
        """URL of the picture / video thumbnail of the post"""
        return self._node["display_url"] if "display_url" in self._node else self._node["display_src"]

    @property
    def typename(self) -> str:
        """Type of post, GraphImage, GraphVideo or GraphSidecar"""
        if '__typename' in self._node:
            return self._node['__typename']
        # if __typename is not in node, it is an old image or video
        return 'GraphImage'

    def get_sidecar_edges(self) -> List[Dict[str, Any]]:
        """Return the edges stored below ``edge_sidecar_to_children``."""
        return self._field('edge_sidecar_to_children', 'edges')

    @property
    def caption(self) -> Optional[str]:
        """Caption, or None if the node contains none."""
        if "edge_media_to_caption" in self._node and self._node["edge_media_to_caption"]["edges"]:
            return self._node["edge_media_to_caption"]["edges"][0]["node"]["text"]
        elif "caption" in self._node:
            return self._node["caption"]
        return None

    @property
    def caption_hashtags(self) -> List[str]:
        """List of all lowercased hashtags (without preceding #) that occur in the Post's caption."""
        if not self.caption:
            return []
        # This regular expression is from jStassen, adjusted to use Python's \w to support Unicode
        # http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
        hashtag_regex = re.compile(r"(?:#)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)")
        return re.findall(hashtag_regex, self.caption.lower())

    @property
    def caption_mentions(self) -> List[str]:
        """List of all lowercased profiles that are mentioned in the Post's caption, without preceding @."""
        if not self.caption:
            return []
        # This regular expression is from jStassen, adjusted to use Python's \w to support Unicode
        # http://blog.jstassen.com/2016/03/code-regex-for-instagram-username-and-hashtags/
        mention_regex = re.compile(r"(?:@)(\w(?:(?:\w|(?:\.(?!\.))){0,28}(?:\w))?)")
        return re.findall(mention_regex, self.caption.lower())

    @property
    def tagged_users(self) -> List[str]:
        """List of all lowercased users that are tagged in the Post."""
        try:
            return [edge['node']['user']['username'].lower()
                    for edge in self._field('edge_media_to_tagged_user', 'edges')]
        except KeyError:
            return []

    @property
    def is_video(self) -> bool:
        """True if the Post is a video."""
        return self._node['is_video']

    @property
    def video_url(self) -> Optional[str]:
        """URL of the video, or None."""
        if self.is_video:
            return self._field('video_url')
        return None

    @property
    def viewer_has_liked(self) -> Optional[bool]:
        """Whether the viewer has liked the post, or None if not logged in."""
        if not self._instaloader.is_logged_in:
            return None
        if 'likes' in self._node and 'viewer_has_liked' in self._node['likes']:
            return self._node['likes']['viewer_has_liked']
        return self._field('viewer_has_liked')

    @property
    def likes(self) -> int:
        """Likes count"""
        return self._field('edge_media_preview_like', 'count')

    @property
    def comments(self) -> int:
        """Comment count"""
        return self._field('edge_media_to_comment', 'count')

    def get_comments(self) -> Iterator[Dict[str, Any]]:
        """Iterate over all comments of the post."""
        if self.comments == 0:
            # Avoid doing additional requests if there are no comments
            return
        comment_edges = self._field('edge_media_to_comment', 'edges')
        if self.comments == len(comment_edges):
            # If the Post's metadata already contains all comments, don't do GraphQL requests to obtain them
            yield from (comment['node'] for comment in comment_edges)
            # Stop here; falling through would request the comments again and yield each of them twice.
            return
        yield from self._instaloader.graphql_node_list(17852405266163336, {'shortcode': self.shortcode},
                                                       'https://www.instagram.com/p/' + self.shortcode + '/',
                                                       lambda d: d['data']['shortcode_media']['edge_media_to_comment'])

    def get_location(self) -> Optional[Dict[str, str]]:
        """If the Post has a location, returns a dictionary with fields 'lat' and 'lng'."""
        loc_dict = self._field("location")
        if loc_dict is not None:
            location_json = self._instaloader.get_json("explore/locations/{0}/".format(loc_dict["id"]),
                                                       params={'__a': 1})
            return location_json["location"]
        return None

    @staticmethod
    def json_encoder(obj) -> Dict[str, Any]:
        """Convert instance of :class:`Post` to a JSON-serializable dictionary."""
        if not isinstance(obj, Post):
            raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__))
        jsondict = {}
        for prop in dir(Post):
            if prop[0].isupper() or prop[0] == '_':
                # skip uppercase and private properties
                continue
            val = getattr(obj, prop)
            # Only plain values are exported; bound methods and None are left out.
            if val is True or val is False or isinstance(val, (str, int, float, list)):
                jsondict[prop] = val
            elif isinstance(val, datetime):
                jsondict[prop] = val.isoformat()
        return jsondict
|
2017-08-19 12:58:28 +02:00
|
|
|
|
2017-08-11 19:51:00 +02:00
|
|
|
class Tristate(Enum):
    """Three-valued switch telling whether a kind of information (videos, captions, comments, geotags, ...)
    should be saved.

    :attr:`never`
       Never save it, not even when it is at hand without any additional request.

    :attr:`no_extra_query`
       Save it exactly when it is available without doing additional queries.

    :attr:`always`
       Save it, doing additional queries if necessary.
    """

    never = 0
    no_extra_query = 1
    always = 2
|
|
|
|
|
|
|
|
|
2017-06-24 22:43:40 +02:00
|
|
|
class Instaloader:
|
|
|
|
def __init__(self,
             sleep: bool = True, quiet: bool = False,
             user_agent: Optional[str] = None,
             dirname_pattern: Optional[str] = None,
             filename_pattern: Optional[str] = None,
             download_videos: Tristate = Tristate.always,
             download_video_thumbnails: Tristate = Tristate.always,
             download_geotags: Tristate = Tristate.no_extra_query,
             save_captions: Tristate = Tristate.no_extra_query,
             download_comments: Tristate = Tristate.no_extra_query,
             save_metadata: Tristate = Tristate.never,
             max_connection_attempts: int = 3):
    """Set up a not-logged-in Instaloader instance.

    :param sleep: Whether :meth:`_sleep` pauses for a short random time.
    :param quiet: Whether messages passed to :meth:`_log` are suppressed.
    :param user_agent: User-Agent header value; None selects :func:`default_user_agent`.
    :param dirname_pattern: Stored as ``self.dirname_pattern``; None selects ``'{target}'``.
    :param filename_pattern: Stored as ``self.filename_pattern`` with every ``{date}`` expanded to
       ``{date:%Y-%m-%d_%H-%M-%S}``; None selects that date field alone.
    :param download_videos: Stored as ``self.download_videos``.
    :param download_video_thumbnails: Stored as ``self.download_video_thumbnails``.
    :param download_geotags: Stored as ``self.download_geotags``.
    :param save_captions: Stored as ``self.save_captions``.
    :param download_comments: Stored as ``self.download_comments``.
    :param save_metadata: Stored as ``self.save_metadata``.
    :param max_connection_attempts: Attempt number at which failing requests are given up.
    """
    date_field = '{date:%Y-%m-%d_%H-%M-%S}'

    if user_agent is None:
        user_agent = default_user_agent()
    if dirname_pattern is None:
        dirname_pattern = '{target}'
    if filename_pattern is None:
        filename_pattern = date_field
    else:
        filename_pattern = filename_pattern.replace('{date}', date_field)

    # configuration parameters
    # (user_agent has to be set before the session is created, because the session's headers contain it)
    self.user_agent = user_agent
    self.session = self._get_anonymous_session()
    self.username = None
    self.sleep = sleep
    self.quiet = quiet
    self.dirname_pattern = dirname_pattern
    self.filename_pattern = filename_pattern
    self.download_videos = download_videos
    self.download_video_thumbnails = download_video_thumbnails
    self.download_geotags = download_geotags
    self.save_captions = save_captions
    self.download_comments = download_comments
    self.save_metadata = save_metadata
    self.max_connection_attempts = max_connection_attempts

    # error log, filled with error() and printed at the end of Instaloader.main()
    self.error_log = []

    # For the adaption of sleep intervals (rate control)
    self.previous_queries = {}
|
2017-08-20 11:28:12 +02:00
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
@property
|
|
|
|
def is_logged_in(self) -> bool:
|
2017-08-28 21:45:43 +02:00
|
|
|
"""True, if this Instaloader instance is logged in."""
|
2017-08-19 12:58:28 +02:00
|
|
|
return bool(self.username)
|
|
|
|
|
2017-08-11 19:51:00 +02:00
|
|
|
@contextmanager
def anonymous_copy(self):
    """Yield an anonymous, otherwise equally-configured copy of an Instaloader instance; Then copy its error log.

    The copy shares this instance's query timestamps (``previous_queries``). When the ``with`` block is left,
    also if it is left by an exception, the copy's error log is appended to this instance's error log and the
    query timestamps are taken over from the copy.
    """
    # Keyword arguments keep this call correct even if the constructor's parameter order changes.
    new_loader = Instaloader(sleep=self.sleep, quiet=self.quiet, user_agent=self.user_agent,
                             dirname_pattern=self.dirname_pattern, filename_pattern=self.filename_pattern,
                             download_videos=self.download_videos,
                             download_video_thumbnails=self.download_video_thumbnails,
                             download_geotags=self.download_geotags,
                             save_captions=self.save_captions, download_comments=self.download_comments,
                             save_metadata=self.save_metadata,
                             max_connection_attempts=self.max_connection_attempts)
    new_loader.previous_queries = self.previous_queries
    try:
        yield new_loader
    finally:
        # Without try/finally, errors logged by the copy would be lost whenever the with-body raises.
        self.error_log.extend(new_loader.error_log)
        self.previous_queries = new_loader.previous_queries
|
2017-08-11 19:51:00 +02:00
|
|
|
|
2017-06-24 22:43:40 +02:00
|
|
|
def _log(self, *msg, sep='', end='\n', flush=False):
|
2017-08-06 19:27:46 +02:00
|
|
|
"""Log a message to stdout that can be suppressed with --quiet."""
|
2017-06-24 22:43:40 +02:00
|
|
|
if not self.quiet:
|
|
|
|
print(*msg, sep=sep, end=end, flush=flush)
|
|
|
|
|
2017-08-28 21:05:46 +02:00
|
|
|
def error(self, msg, repeat_at_end = True):
|
|
|
|
"""Log a non-fatal error message to stderr, which is repeated at program termination.
|
|
|
|
|
|
|
|
:param repeat_at_end: Set to false if the message should be printed, but not repeated at program termination."""
|
2017-08-06 19:27:46 +02:00
|
|
|
print(msg, file=sys.stderr)
|
2017-08-28 21:05:46 +02:00
|
|
|
if repeat_at_end:
|
|
|
|
self.error_log.append(msg)
|
2017-08-06 19:27:46 +02:00
|
|
|
|
|
|
|
@contextmanager
|
|
|
|
def _error_catcher(self, extra_info: Optional[str] = None):
|
|
|
|
"""
|
2017-08-11 17:50:37 +02:00
|
|
|
Context manager to catch, print and record InstaloaderExceptions.
|
2017-08-06 19:27:46 +02:00
|
|
|
|
|
|
|
:param extra_info: String to prefix error message with."""
|
|
|
|
try:
|
|
|
|
yield
|
2017-08-11 17:50:37 +02:00
|
|
|
except InstaloaderException as err:
|
2017-08-06 19:27:46 +02:00
|
|
|
if extra_info:
|
2017-08-19 12:58:28 +02:00
|
|
|
self.error('{}: {}'.format(extra_info, err))
|
2017-08-06 19:27:46 +02:00
|
|
|
else:
|
2017-08-19 12:58:28 +02:00
|
|
|
self.error('{}'.format(err))
|
2017-08-06 19:27:46 +02:00
|
|
|
|
2017-07-26 15:08:11 +02:00
|
|
|
def _sleep(self):
|
2017-08-20 11:28:12 +02:00
|
|
|
"""Sleep a short time if self.sleep is set. Called before each request to instagram.com."""
|
2017-08-24 18:30:46 +02:00
|
|
|
if self.sleep:
|
|
|
|
time.sleep(random.uniform(0.5, 3))
|
2017-07-26 15:08:11 +02:00
|
|
|
|
2017-09-29 16:09:15 +02:00
|
|
|
def _get_and_write_raw(self, url: str, filename: str, _attempt = 1) -> None:
    """Download ``url`` with a fresh anonymous session and write the response body to ``filename``.

    Failed downloads are retried recursively until ``_attempt`` equals ``self.max_connection_attempts``.
    While waiting for a retry, a KeyboardInterrupt skips the download.

    :param url: URL to download.
    :param filename: Path of the file to write.
    :param _attempt: Number of the current attempt; incremented on each recursive retry.
    :raises QueryReturnedNotFoundException: When the server responds with a 404.
    :raises ConnectionException: When download repeatedly failed."""
    try:
        response = self._get_anonymous_session().get(url, stream=True)
        if response.status_code == 404:
            # 404 not worth retrying.
            raise QueryReturnedNotFoundException("404 when accessing {}.".format(url))
        if response.status_code != 200:
            raise ConnectionException("HTTP error code {}.".format(response.status_code))
        self._log(filename, end=' ', flush=True)
        with open(filename, 'wb') as target:
            # Let urllib3 undo the transfer encoding (gzip/deflate) while the raw stream is copied.
            response.raw.decode_content = True
            shutil.copyfileobj(response.raw, target)
    except (urllib3.exceptions.HTTPError, requests.exceptions.RequestException, ConnectionException) as err:
        failure = "URL {}: {}".format(url, err)
        if _attempt == self.max_connection_attempts:
            raise ConnectionException(failure)
        self.error(failure + " [retrying; skip with ^C]", repeat_at_end=False)
        try:
            self._sleep()
            self._get_and_write_raw(url, filename, _attempt + 1)
        except KeyboardInterrupt:
            self.error("[skipped by user]", repeat_at_end=False)
            raise ConnectionException(failure)
|
2017-07-31 20:34:27 +02:00
|
|
|
|
2017-08-24 18:30:46 +02:00
|
|
|
def get_json(self, url: str, params: Dict[str, Any],
             session: Optional[requests.Session] = None, _attempt = 1) -> Dict[str, Any]:
    """JSON request to Instagram.

    GraphQL queries (``query_id`` in ``params`` and ``graphql/query`` in ``url``) are throttled on the client
    side using the per-query timestamps kept in ``self.previous_queries``. Failed requests are retried
    recursively until ``_attempt`` equals ``self.max_connection_attempts``; while waiting for a retry, a
    KeyboardInterrupt aborts the query.

    :param url: URL, relative to www.instagram.com/
    :param params: GET parameters
    :param session: Session to use, or None to use self.session
    :param _attempt: Number of the current attempt; incremented on each recursive retry.
    :return: Decoded response dictionary
    :raises QueryReturnedNotFoundException: When the server responds with a 404.
    :raises ConnectionException: When query repeatedly failed.
    """
    def graphql_query_waittime(query_id: int, untracked_queries: bool = False) -> int:
        """Return the number of seconds to wait before doing the GraphQL query ``query_id``.

        Only timestamps within the last ``sliding_window`` seconds are considered; older ones are dropped
        from ``self.previous_queries``. With ``untracked_queries`` set, a wait time is computed even if fewer
        than 100 queries were recorded in the window.
        """
        sliding_window = 660
        timestamps = self.previous_queries.get(query_id)
        if not timestamps:
            return sliding_window if untracked_queries else 0
        current_time = time.monotonic()
        timestamps = list(filter(lambda t: t > current_time - sliding_window, timestamps))
        self.previous_queries[query_id] = timestamps
        if len(timestamps) < 100 and not untracked_queries:
            return 0
        # Wait until the oldest recorded query leaves the window, plus 6 seconds.
        # NOTE(review): min() raises ValueError if the filter above removed every timestamp and
        # untracked_queries is set -- confirm whether that can happen in practice.
        return round(min(timestamps) + sliding_window - current_time) + 6
    is_graphql_query = 'query_id' in params and 'graphql/query' in url
    if is_graphql_query:
        query_id = params['query_id']
        waittime = graphql_query_waittime(query_id)
        if waittime > 0:
            self._log('\nToo many queries in the last time. Need to wait {} seconds.'.format(waittime))
            time.sleep(waittime)
        # Record this query for the rate control of subsequent calls.
        timestamp_list = self.previous_queries.get(query_id)
        if timestamp_list is not None:
            timestamp_list.append(time.monotonic())
        else:
            self.previous_queries[query_id] = [time.monotonic()]
    sess = session if session else self.session
    try:
        self._sleep()
        resp = sess.get('https://www.instagram.com/' + url, params=params)
        if resp.status_code == 404:
            # Not a ConnectionException, hence not caught below and never retried.
            raise QueryReturnedNotFoundException("404")
        if resp.status_code == 429:
            raise TooManyRequests("429 - Too Many Requests")
        if resp.status_code != 200:
            raise ConnectionException("HTTP error code {}.".format(resp.status_code))
        resp_json = resp.json()
        # A response may carry its own 'status' field; anything but "ok" is treated as a failure.
        if 'status' in resp_json and resp_json['status'] != "ok":
            if 'message' in resp_json:
                raise ConnectionException("Returned \"{}\" status, message \"{}\".".format(resp_json['status'],
                                                                                           resp_json['message']))
            else:
                raise ConnectionException("Returned \"{}\" status.".format(resp_json['status']))
        return resp_json
    except (ConnectionException, json.decoder.JSONDecodeError, requests.exceptions.RequestException) as err:
        error_string = "JSON Query to {}: {}".format(url, err)
        if _attempt == self.max_connection_attempts:
            raise ConnectionException(error_string)
        self.error(error_string + " [retrying; skip with ^C]", repeat_at_end=False)
        text_for_429 = ("HTTP error code 429 was returned because too many queries occured in the last time. "
                        "Please do not use Instagram in your browser or run multiple instances of Instaloader "
                        "in parallel.")
        try:
            # NOTE(review): the nesting of the following statements was reconstructed from a listing
            # without indentation -- confirm against the original file.
            if isinstance(err, TooManyRequests):
                print(textwrap.fill(text_for_429), file=sys.stderr)
                if is_graphql_query:
                    # The server throttled us although our own bookkeeping did not expect it, so assume
                    # queries we did not track and wait accordingly.
                    waittime = graphql_query_waittime(query_id=params['query_id'], untracked_queries=True)
                    if waittime > 0:
                        self._log('The request will be retried in {} seconds.'.format(waittime))
                        time.sleep(waittime)
            self._sleep()
            return self.get_json(url, params, sess, _attempt + 1)
        except KeyboardInterrupt:
            self.error("[skipped by user]", repeat_at_end=False)
            raise ConnectionException(error_string)
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2017-08-11 17:50:37 +02:00
|
|
|
def _default_http_header(self, empty_session_only: bool = False) -> Dict[str, str]:
    """Build the default HTTP header dictionary used for requests.

    :param empty_session_only: If true, the ``Host``, ``Origin``, ``Referer``,
       ``X-Instagram-AJAX`` and ``X-Requested-With`` entries are left out.
    :return: A newly created header dictionary.
    """
    # Entries that are dropped when only the reduced header set is requested.
    omitted_when_reduced = ('Host', 'Origin', 'Referer', 'X-Instagram-AJAX', 'X-Requested-With')
    all_entries = [('Accept-Encoding', 'gzip, deflate'),
                   ('Accept-Language', 'en-US,en;q=0.8'),
                   ('Connection', 'keep-alive'),
                   ('Content-Length', '0'),
                   ('Host', 'www.instagram.com'),
                   ('Origin', 'https://www.instagram.com'),
                   ('Referer', 'https://www.instagram.com/'),
                   ('User-Agent', self.user_agent),
                   ('X-Instagram-AJAX', '1'),
                   ('X-Requested-With', 'XMLHttpRequest')]
    return {key: value for key, value in all_entries
            if not (empty_session_only and key in omitted_when_reduced)}
|
|
|
|
|
2017-08-11 17:50:37 +02:00
|
|
|
def _get_anonymous_session(self) -> requests.Session:
    """Create a new :class:`requests.Session` preloaded with blank cookies and the reduced default header."""
    blank_cookies = {'sessionid': '', 'mid': '', 'ig_pr': '1',
                     'ig_vw': '1920', 'csrftoken': '',
                     's_network': '', 'ds_user_id': ''}
    anonymous = requests.Session()
    anonymous.cookies.update(blank_cookies)
    anonymous.headers.update(self._default_http_header(empty_session_only=True))
    return anonymous
|
|
|
|
|
2017-07-24 12:08:08 +02:00
|
|
|
def graphql_query(self, query_id: int, variables: Dict[str, Any],
                  referer: Optional[str] = None) -> Dict[str, Any]:
    """
    Do a GraphQL Query.

    The query is sent through a copy of the current session, so the header
    adjustments made here do not touch ``self.session``.

    :param query_id: Query ID.
    :param variables: Variables for the Query.
    :param referer: HTTP Referer, or None.
    :return: The server's response dictionary.
    """
    query_session = copy_session(self.session)
    headers = query_session.headers
    headers.update(self._default_http_header(empty_session_only=True))
    for unwanted in ('Connection', 'Content-Length'):
        del headers[unwanted]
    headers['authority'] = 'www.instagram.com'
    headers['scheme'] = 'https'
    headers['accept'] = '*/*'
    if referer is not None:
        headers['referer'] = urllib.parse.quote(referer)

    # Compact separators keep the serialized variables free of whitespace.
    query_params = {'query_id': query_id,
                    'variables': json.dumps(variables, separators=(',', ':'))}
    response = self.get_json('graphql/query', params=query_params, session=query_session)
    if 'status' not in response:
        self.error("GraphQL response did not contain a \"status\" field.")
    return response
|
2017-07-24 12:08:08 +02:00
|
|
|
|
2017-06-24 22:43:40 +02:00
|
|
|
def get_username_by_id(self, profile_id: int) -> str:
    """Look up the current username of a profile, given its unique ID.

    The name is read from the owner of the first timeline post returned for the ID.

    :param profile_id: Unique ID of the profile.
    :return: The owner username of that post.
    :raises ProfileNotExistsException: If the query returns no user.
    :raises ProfileHasNoPicsException: If the timeline's count is zero.
    :raises LoginRequiredException: If the timeline reports posts but returns none of them.
    """
    user = self.graphql_query(17862015703145017, {'id': str(profile_id), 'first': 1})['data']['user']
    if not user:
        raise ProfileNotExistsException("No profile found, the user may have blocked you (ID: " +
                                        str(profile_id) + ").")
    timeline = user["edge_owner_to_timeline_media"]
    if timeline['edges']:
        first_media_id = int(timeline['edges'][0]["node"]["id"])
        return Post.from_mediaid(self, first_media_id).owner_username
    if timeline['count'] == 0:
        raise ProfileHasNoPicsException("Profile with ID {0}: no pics found.".format(str(profile_id)))
    raise LoginRequiredException("Login required to determine username (ID: " + str(profile_id) + ").")
|
2017-06-24 22:43:40 +02:00
|
|
|
|
|
|
|
def get_id_by_username(self, profile: str) -> int:
    """Return the unique numeric ID of the profile with the given name.

    The ID is read from the ``['user']['id']`` entry of the profile metadata.

    :param profile: Name of the profile.
    :return: The profile's ID as an integer.
    """
    metadata = self.get_profile_metadata(profile)
    raw_id = metadata['user']['id']
    return int(raw_id)
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def graphql_node_list(self, query_id: int, query_variables: Dict[str, Any], query_referer: Optional[str],
                      edge_extractor: Callable[[Dict[str, Any]], Dict[str, Any]]) -> Iterator[Dict[str, Any]]:
    """Retrieve a list of GraphQL nodes, following pagination until the last page.

    :param query_id: Query ID passed on to :meth:`graphql_query`.
    :param query_variables: Variables for the query. The dictionary is not modified;
       the paging keys ``first`` and ``after`` are set on an internal copy.
    :param query_referer: HTTP Referer passed on to :meth:`graphql_query`, or None.
    :param edge_extractor: Function that maps a response dictionary to the structure
       holding ``edges`` and ``page_info``.
    :return: Iterator over the ``node`` entry of every edge, page by page.
    """
    # Work on a copy so the paging keys do not leak back into the caller's dictionary.
    variables = dict(query_variables)
    variables['first'] = 200
    while True:
        data = self.graphql_query(query_id, variables, query_referer)
        edge_struct = edge_extractor(data)
        yield from [edge['node'] for edge in edge_struct['edges']]
        page_info = edge_struct['page_info']
        if not page_info['has_next_page']:
            break
        variables['after'] = page_info['end_cursor']
|
|
|
|
|
|
|
|
def get_followers(self, profile: str) -> Iterator[Dict[str, Any]]:
    """
    Retrieve list of followers of given profile.
    To use this, one needs to be logged in and private profiles have to be followed,
    otherwise this returns an empty list.

    :param profile: Name of profile to lookup followers.
    """
    query_variables = {'id': str(self.get_id_by_username(profile))}
    profile_url = 'https://www.instagram.com/' + profile + '/'

    def followed_by_edges(response: Dict[str, Any]) -> Dict[str, Any]:
        return response['data']['user']['edge_followed_by']

    yield from self.graphql_node_list(17851374694183129, query_variables, profile_url, followed_by_edges)
|
2017-07-20 17:57:12 +02:00
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def get_followees(self, profile: str) -> Iterator[Dict[str, Any]]:
    """
    Retrieve list of followees (followings) of given profile.
    To use this, one needs to be logged in and private profiles have to be followed,
    otherwise this returns an empty list.

    :param profile: Name of profile to lookup followees.
    """
    query_variables = {'id': str(self.get_id_by_username(profile))}
    profile_url = 'https://www.instagram.com/' + profile + '/'

    def follow_edges(response: Dict[str, Any]) -> Dict[str, Any]:
        return response['data']['user']['edge_follow']

    yield from self.graphql_node_list(17874545323001329, query_variables, profile_url, follow_edges)
|
2017-07-20 22:30:12 +02:00
|
|
|
|
2017-07-29 11:08:52 +02:00
|
|
|
def download_pic(self, filename: str, url: str, mtime: datetime,
                 filename_suffix: Optional[str] = None) -> bool:
    """Download the file at the given URL and store it under the given filename.

    The file extension is taken from the URL: the part between the last matched
    ``.`` and a following ``?`` if there is one, otherwise the URL's last three
    characters.

    :param filename: Target path without extension.
    :param url: URL to fetch.
    :param mtime: Modification time to set on the stored file.
    :param filename_suffix: If given, appended to the filename after an underscore.
    :return: True if the file was downloaded, False if it already existed.
    """
    extension_match = re.search('\\.[a-z0-9]*\\?', url)
    if extension_match is None:
        extension = url[-3:]
    else:
        # Strip the leading '.' and the trailing '?' from the match.
        extension = extension_match.group(0)[1:-1]
    suffix_part = '' if filename_suffix is None else '_' + filename_suffix
    target_path = filename + suffix_part + '.' + extension
    if os.path.isfile(target_path):
        self._log(target_path + ' exists', end=' ', flush=True)
        return False
    self._get_and_write_raw(url, target_path)
    # Access time is "now"; modification time is the supplied timestamp.
    os.utime(target_path, (datetime.now().timestamp(), mtime.timestamp()))
    return True
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2017-09-29 14:02:58 +02:00
|
|
|
def save_metadata_json(self, filename: str, post: Post) -> None:
    """Saves metadata JSON file of a :class:`Post`.

    :param filename: Target path without extension; ``.json`` is appended.
    :param post: Post to serialize, encoded with :meth:`Post.json_encoder`.
    """
    filename += '.json'
    # The context manager closes (and thereby flushes) the file deterministically.
    with open(filename, 'w') as json_file:
        json.dump(post, fp=json_file, indent=4, default=Post.json_encoder)
    self._log('json', end=' ', flush=True)
|
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
def update_comments(self, filename: str, post: Post) -> None:
    """Merge the comments of a :class:`Post` into its ``_comments.json`` file.

    Comments already stored in the file are loaded, the post's current comments are
    appended, and the result is written back sorted by ``created_at`` (newest first,
    ties ordered by ``id``), dropping neighbouring entries with the same ``id``.
    Nothing is written if there are no comments at all.

    :param filename: Target path without extension; ``_comments.json`` is appended.
    :param post: Post whose comments are fetched via :meth:`Post.get_comments`.
    """
    filename += '_comments.json'
    try:
        with open(filename) as existing_file:
            comments = json.load(existing_file)
    except FileNotFoundError:
        comments = list()
    comments.extend(post.get_comments())
    if comments:
        # Sort and deduplicate before opening the file for writing, so a failure
        # here cannot leave behind a truncated comments file.
        comments_list = sorted(sorted(comments, key=lambda t: t['id']),
                               key=lambda t: t['created_at'], reverse=True)
        unique_comments_list = [comments_list[0]]
        for previous, current in zip(comments_list, comments_list[1:]):
            if previous['id'] != current['id']:
                unique_comments_list.append(current)
        with open(filename, 'w') as file:
            file.write(json.dumps(unique_comments_list, indent=4))
        self._log('comments', end=' ', flush=True)
|
|
|
|
|
2017-07-29 11:08:52 +02:00
|
|
|
def save_caption(self, filename: str, mtime: datetime, caption: str) -> None:
    """Write the caption to ``<filename>.txt``, keeping older versions.

    If the file already holds the same caption (ignoring CRLF/LF differences) nothing
    is written. If it holds a different caption, existing files are shifted to
    ``_old_NN`` names before the new caption is stored.

    :param filename: Target path without extension.
    :param mtime: Modification time to set on the written file.
    :param caption: Caption text; stored UTF-8 encoded.
    """

    def log_with_fallback(preferred: str, fallback: str) -> None:
        # The preview may contain characters the output stream cannot encode.
        try:
            self._log(preferred, end=' ', flush=True)
        except UnicodeEncodeError:
            self._log(fallback, end=' ', flush=True)

    def versioned_name(index: int) -> str:
        # Index 0 is the current file, higher indices are the "_old_NN" backups.
        if index == 0:
            return txt_path
        padding = str(0) if index < 10 else str()
        return txt_path[:-4] + '_old_' + padding + str(index) + txt_path[-4:]

    txt_path = filename + '.txt'
    flattened = caption.replace('\n', ' ').strip()
    encoded = caption.encode("UTF-8")
    preview = '[' + ((flattened[:29] + u"\u2026") if len(flattened) > 31 else flattened) + ']'

    with suppress(FileNotFoundError):
        with open(txt_path, 'rb') as existing:
            stored = existing.read()
        if stored.replace(b'\r\n', b'\n') == encoded.replace(b'\r\n', b'\n'):
            log_with_fallback(preview + ' unchanged', 'txt unchanged')
            return None
        # Find the first unused version index, then shift every file up by one.
        free_index = 0
        while os.path.isfile(versioned_name(free_index)):
            free_index += 1
        for index in range(free_index, 0, -1):
            os.rename(versioned_name(index - 1), versioned_name(index))
        log_with_fallback(preview + ' updated', 'txt updated')

    log_with_fallback(preview, 'txt')
    with open(txt_path, 'wb') as text_file:
        shutil.copyfileobj(BytesIO(encoded), text_file)
    os.utime(txt_path, (datetime.now().timestamp(), mtime.timestamp()))
|
2017-04-17 12:10:43 +02:00
|
|
|
|
2017-07-29 11:08:52 +02:00
|
|
|
def save_location(self, filename: str, location_json: Dict[str, str], mtime: datetime) -> None:
    """Write the location's name and a Google Maps link to ``<filename>_location.txt``.

    :param filename: Target path without extension.
    :param location_json: Dictionary providing the keys ``name``, ``lat`` and ``lng``.
    :param mtime: Modification time to set on the written file.
    """
    geo_path = filename + '_location.txt'
    name_line = location_json["name"] + "\n"
    maps_line = "https://maps.google.com/maps?q={0},{1}&ll={0},{1}\n".format(location_json["lat"],
                                                                            location_json["lng"])
    payload = (name_line + maps_line).encode()
    with open(geo_path, 'wb') as text_file:
        text_file.write(payload)
    os.utime(geo_path, (datetime.now().timestamp(), mtime.timestamp()))
    self._log('geo', end=' ', flush=True)
|
|
|
|
|
|
|
|
def download_profilepic(self, name: str, url: str) -> None:
    """Downloads and saves profile pic with given url.

    The file name carries the timestamp from the ``Last-Modified`` header of a HEAD
    request to the URL; if a file with that name already exists, nothing is downloaded.

    :param name: Profile name; used lower-cased in the directory and/or file name.
    :param url: URL of the profile picture. Its last three characters become the extension.
    """
    last_modified = self._get_anonymous_session().head(url).headers["Last-Modified"]
    date_object = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S GMT')
    timestamp_text = date_object.strftime('%Y-%m-%d_%H-%M-%S')
    lowered_name = name.lower()
    extension = url[-3:]

    dirname_mentions_name = (format_string_contains_key(self.dirname_pattern, 'profile') or
                             format_string_contains_key(self.dirname_pattern, 'target'))
    if dirname_mentions_name:
        # The directory already identifies the profile, so the file name does not repeat it.
        directory = self.dirname_pattern.format(profile=lowered_name, target=lowered_name)
        filename = '{0}/{1}_UTC_profile_pic.{2}'.format(directory, timestamp_text, extension)
    else:
        filename = '{0}/{1}_{2}_UTC_profile_pic.{3}'.format(self.dirname_pattern.format(), lowered_name,
                                                            timestamp_text, extension)

    if os.path.isfile(filename):
        self._log(filename + ' already exists')
        return None

    # Replace a square "/sNNNxNNN/" size segment in the URL by "/s2048x2048/".
    url = re.sub(r'/s([1-9][0-9]{2})x\1/', '/s2048x2048/', url)
    self._get_and_write_raw(url, filename)
    os.utime(filename, (datetime.now().timestamp(), date_object.timestamp()))
    self._log('')  # log output of _get_and_write_raw() does not produce \n
|
2017-06-24 22:43:40 +02:00
|
|
|
|
|
|
|
def save_session_to_file(self, filename: Optional[str] = None) -> None:
    """Saves the cookies of the internally stored :class:`requests.Session` object to a file.

    :param filename: Target file; if None, the default session filename for
       ``self.username`` is used.
    """
    target = get_default_session_filename(self.username) if filename is None else filename
    parent = os.path.dirname(target)
    if parent != '' and not os.path.exists(parent):
        os.makedirs(parent)
        os.chmod(parent, 0o700)
    with open(target, 'wb') as sessionfile:
        # Restrict permissions before any cookie data is written.
        os.chmod(target, 0o600)
        cookie_dict = requests.utils.dict_from_cookiejar(self.session.cookies)
        pickle.dump(cookie_dict, sessionfile)
        self._log("Saved session to %s." % target)
|
|
|
|
|
|
|
|
def load_session_from_file(self, username: str, filename: Optional[str] = None) -> None:
    """Internally stores :class:`requests.Session` object loaded from file.

    If filename is None, the file with the default session path is loaded.

    :param username: Username the session belongs to; stored as ``self.username``.
    :param filename: File to load the pickled cookies from, or None.
    :raises FileNotFoundError: If the file does not exist.
    """
    source = get_default_session_filename(username) if filename is None else filename
    with open(source, 'rb') as sessionfile:
        restored = requests.Session()
        # NOTE(review): pickle.load executes whatever the file encodes; only load
        # session files that come from a trusted location.
        stored_cookies = pickle.load(sessionfile)
        restored.cookies = requests.utils.cookiejar_from_dict(stored_cookies)
        restored.headers.update(self._default_http_header())
        csrf_token = restored.cookies.get_dict()['csrftoken']
        restored.headers.update({'X-CSRFToken': csrf_token})
        self._log("Loaded session from %s." % source)
        self.session = restored
        self.username = username
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def test_login(self, session: Optional[requests.Session]) -> Optional[str]:
    """Returns the Instagram username to which given :class:`requests.Session` object belongs, or None.

    :param session: Session to check; a falsy value yields None without any request.
    :return: The username found in the response, or None if the response has no ``graphql`` entry.
    """
    if not session:
        return None
    data = self.get_json('', params={'__a': 1}, session=session)
    if 'graphql' not in data:
        return None
    return data['graphql']['user']['username']
|
2017-06-24 22:43:40 +02:00
|
|
|
|
|
|
|
def login(self, user: str, passwd: str) -> None:
    """Log in to instagram with given username and password and internally store session object.

    :param user: Username to log in with.
    :param passwd: Password of that user.
    :raises ConnectionException: If the login request does not return HTTP status 200.
    :raises BadCredentialsException: If the session does not belong to ``user`` afterwards.
    """
    initial_cookies = {'sessionid': '', 'mid': '', 'ig_pr': '1',
                       'ig_vw': '1920', 'csrftoken': '',
                       's_network': '', 'ds_user_id': ''}
    session = requests.Session()
    session.cookies.update(initial_cookies)
    session.headers.update(self._default_http_header())

    # Fetch the start page first to obtain a CSRF token for the login request.
    self._sleep()
    start_page = session.get('https://www.instagram.com/')
    session.headers.update({'X-CSRFToken': start_page.cookies['csrftoken']})

    self._sleep()
    credentials = {'password': passwd, 'username': user}
    login = session.post('https://www.instagram.com/accounts/login/ajax/',
                         data=credentials, allow_redirects=True)
    session.headers.update({'X-CSRFToken': login.cookies['csrftoken']})

    if login.status_code != 200:
        raise ConnectionException('Login error! Connection error!')
    if user != self.test_login(session):
        raise BadCredentialsException('Login error! Check your credentials!')
    self.username = user
    self.session = session
|
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
def download_post(self, post: Post, target: str) -> bool:
    """
    Download everything associated with one instagram post node, i.e. picture, caption and video.

    :param post: Post to download.
    :param target: Target name, i.e. profile name, #hashtag, :feed; for filename.
    :return: True if something was downloaded, False otherwise, i.e. file was already there
    """

    # post.owner_username might do an additional request, so it is only accessed if
    # {profile} is part of the dirname pattern or filename pattern.
    if (format_string_contains_key(self.dirname_pattern, 'profile') or
            format_string_contains_key(self.filename_pattern, 'profile')):
        profilename = post.owner_username
    else:
        profilename = None
    lowered_target = target.lower()
    dirname = self.dirname_pattern.format(profile=profilename, target=lowered_target)
    basename = self.filename_pattern.format(profile=profilename, target=lowered_target,
                                            date=post.date, shortcode=post.shortcode,
                                            post=post)
    filename = dirname + '/' + basename
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Download the image(s) / video thumbnail and videos within sidecars if desired
    downloaded = False
    if post.typename == 'GraphSidecar':
        for edge_number, edge in enumerate(post.get_sidecar_edges(), start=1):
            node = edge['node']
            # Picture, or thumbnail of a video
            if not node['is_video'] or self.download_video_thumbnails is Tristate.always:
                downloaded |= self.download_pic(filename=filename,
                                                url=node['display_url'],
                                                mtime=post.date,
                                                filename_suffix=str(edge_number))
            # The video itself, if available and desired
            if node['is_video'] and self.download_videos is Tristate.always:
                downloaded |= self.download_pic(filename=filename,
                                                url=node['video_url'],
                                                mtime=post.date,
                                                filename_suffix=str(edge_number))
    elif post.typename == 'GraphImage':
        downloaded = self.download_pic(filename=filename, url=post.url, mtime=post.date)
    elif post.typename == 'GraphVideo':
        if self.download_video_thumbnails is Tristate.always:
            downloaded = self.download_pic(filename=filename, url=post.url, mtime=post.date)
    else:
        self.error("Warning: {0} has unknown typename: {1}".format(post, post.typename))

    # Caption
    if self.save_captions is not Tristate.never:
        if post.caption:
            self.save_caption(filename, post.date, post.caption)
        else:
            self._log("<no caption>", end=' ', flush=True)

    # Video of a non-sidecar post
    if post.is_video and self.download_videos is Tristate.always:
        downloaded |= self.download_pic(filename=filename, url=post.video_url, mtime=post.date)

    # Geotags
    if self.download_geotags is Tristate.always:
        location = post.get_location()
        if location:
            self.save_location(filename, location, post.date)

    # Comments
    if self.download_comments is Tristate.always:
        self.update_comments(filename, post)

    # Metadata as JSON. It might require an extra query, depending on which information has been
    # already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query.
    if self.save_metadata is Tristate.always:
        self.save_metadata_json(filename, post)

    self._log()
    return downloaded
|
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def get_stories(self, userids: Optional[List[int]] = None) -> Iterator[Dict[str, Any]]:
    """Get available stories from followees or all stories of users whose ID are given.
    Does not mark stories as seen.
    To use this, one needs to be logged in

    :param userids: List of user IDs to be processed in terms of downloading their stories, or None.
    :return: Iterator over one JSON dictionary per user.
    :raises ConnectionException: If a request does not return HTTP status 200.
    :raises BadResponseException: If the reels tray response has no ``tray`` entry.
    """
    story_session = copy_session(self.session)
    headers = story_session.headers
    headers['User-Agent'] = 'Instagram 10.3.2 (iPhone7,2; iPhone OS 9_3_3; en_US; en-US; scale=2.00; 750x1334) ' \
                            'AppleWebKit/420+'
    for removed in ('Host', 'Origin', 'X-Instagram-AJAX', 'X-Requested-With'):
        del headers[removed]

    def fetch_json(url):
        # Every request is preceded by the instance's usual sleep.
        self._sleep()
        response = story_session.get(url)
        if response.status_code != 200:
            raise ConnectionException('Failed to fetch stories.')
        return json.loads(response.text)

    url_reel_media = 'https://i.instagram.com/api/v1/feed/user/{0}/reel_media/'
    url_reels_tray = 'https://i.instagram.com/api/v1/feed/reels_tray/'

    if userids is not None:
        for userid in userids:
            yield fetch_json(url_reel_media.format(userid))
        return

    tray_response = fetch_json(url_reels_tray)
    if 'tray' not in tray_response:
        raise BadResponseException('Bad story reel JSON.')
    for user in tray_response["tray"]:
        if "items" in user:
            yield user
        else:
            # Tray entry without items: fetch that user's reel separately.
            yield fetch_json(url_reel_media.format(user['user']['pk']))
|
|
|
|
|
2017-07-27 16:59:21 +02:00
|
|
|
def download_stories(self,
                     userids: Optional[List[int]] = None,
                     fast_update: bool = False,
                     filename_target: str = ':stories') -> None:
    """
    Download available stories from user followees or all stories of users whose ID are given.
    Does not mark stories as seen.
    To use this, one needs to be logged in

    :param userids: List of user IDs to be processed in terms of downloading their stories
    :param fast_update: If true, abort when first already-downloaded picture is encountered
    :param filename_target: Replacement for {target} in dirname_pattern and filename_pattern
    :raises InvalidArgumentException: If the filename pattern contains the ``post`` keyword.
    :raises LoginRequiredException: If not logged in.
    :raises BadResponseException: If a story structure has no ``items`` entry.
    """

    if format_string_contains_key(self.filename_pattern, 'post'):
        raise InvalidArgumentException("The \"post\" keyword is not supported in the filename pattern when "
                                       "downloading stories.")

    if not self.is_logged_in:
        raise LoginRequiredException('Login required to download stories')

    for user_stories in self.get_stories(userids):
        if "items" not in user_stories:
            raise BadResponseException('Bad reel media JSON.')
        name = user_stories["user"]["username"].lower()
        self._log("Retrieving stories from profile {}.".format(name))
        items = user_stories["items"]
        totalcount = len(items)
        for position, item in enumerate(items, start=1):
            self._log("[%3i/%3i] " % (position, totalcount), end="", flush=True)
            with self._error_catcher('Download story from user {}'.format(name)):
                downloaded = self.download_story(item, filename_target, name)
                # fast_update: stop with this user at the first item that was already there.
                if fast_update and not downloaded:
                    break
|
|
|
|
|
2017-12-14 16:17:46 +01:00
|
|
|
def download_story(self, item: Dict[str, Any], target: str, profile: str) -> bool:
    """Download one user story.

    :param item: Story item, as in story['items'] for story in :meth:`get_stories`
    :param target: Replacement for {target} in dirname_pattern and filename_pattern
    :param profile: Owner profile name
    :return: True if something was downloaded, False otherwise, i.e. file was already there
    """

    shortcode = item["code"] if "code" in item else "no_code"
    date = datetime.fromtimestamp(item["taken_at"])
    dirname = self.dirname_pattern.format(profile=profile, target=target)
    filename = dirname + '/' + self.filename_pattern.format(profile=profile, target=target,
                                                            date=date,
                                                            shortcode=shortcode)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    downloaded = False
    if "image_versions2" in item:
        # For video stories the image is only the thumbnail; fetch it only if desired.
        if "video_versions" not in item or self.download_video_thumbnails is Tristate.always:
            url = item["image_versions2"]["candidates"][0]["url"]
            downloaded = self.download_pic(filename=filename,
                                           url=url,
                                           mtime=date)
    else:
        self._log("Warning: Unable to find story image.")
    if "caption" in item and item["caption"] is not None and \
            self.save_captions is not Tristate.never:
        caption = item["caption"]
        if isinstance(caption, dict) and "text" in caption:
            caption = caption["text"]
        self.save_caption(filename, date, caption)
    else:
        self._log("<no caption>", end=' ', flush=True)
    if "video_versions" in item and self.download_videos is Tristate.always:
        downloaded |= self.download_pic(filename=filename,
                                        url=item["video_versions"][0]["url"],
                                        mtime=date)
    # "story_locations" is treated as optional like the other item keys: a story
    # item without it is handled as having no location instead of raising KeyError.
    story_locations = item.get("story_locations")
    if story_locations and self.download_geotags is not Tristate.never:
        location = story_locations[0]["location"]
        if location:
            self.save_location(filename, location, date)
    self._log()
    return downloaded
|
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
def get_feed_posts(self) -> Iterator[Post]:
    """Get Posts of the user's feed.

    The first page comes from the start page JSON, all further pages from GraphQL queries.

    :return: Iterator over the feed's posts.
    """

    data = self.get_json('', params={'__a': 1})

    while True:
        # The timeline is found under "graphql" or "data" as an edge list,
        # otherwise under "feed"/"media" as a plain node list.
        for root_key in ("graphql", "data"):
            if root_key in data:
                feed = data[root_key]["user"]["edge_web_feed_timeline"]
                nodes = (edge["node"] for edge in feed["edges"])
                break
        else:
            feed = data["feed"]["media"]
            nodes = iter(feed["nodes"])

        yield from (Post(self, node) for node in nodes)

        page_info = feed["page_info"]
        if not page_info["has_next_page"]:
            break
        next_page_variables = {'fetch_media_item_count': 12,
                               'fetch_media_item_cursor': page_info["end_cursor"],
                               'fetch_comment_count': 4,
                               'fetch_like': 10}
        data = self.graphql_query(17863003771166879, next_page_variables)
|
|
|
|
|
|
|
|
def download_feed_posts(self, max_count: Optional[int] = None, fast_update: bool = False,
                        filter_func: Optional[Callable[[Post], bool]] = None) -> None:
    """
    Download pictures from the user's feed.

    Example to download up to the 20 pics the user last liked::

        loader = Instaloader()
        loader.load_session_from_file('USER')
        loader.download_feed_posts(max_count=20, fast_update=True,
                                   filter_func=lambda post: post.viewer_has_liked)

    :param max_count: Maximum count of pictures to download
    :param fast_update: If true, abort when first already-downloaded picture is encountered
    :param filter_func: function(post), which returns True if given picture should be downloaded
    """
    # Number of posts handed to download_post() so far; skipped posts do not count.
    processed = 0
    for post in self.get_feed_posts():
        if max_count is not None and processed >= max_count:
            break
        name = post.owner_username
        wanted = filter_func is None or filter_func(post)
        if not wanted:
            self._log("<pic by %s skipped>" % name, flush=True)
            continue
        processed += 1
        self._log("[%3i] %s " % (processed, name), end="", flush=True)
        with self._error_catcher('Download feed'):
            downloaded = self.download_post(post, target=':feed')
            if fast_update and not downloaded:
                break
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
def get_hashtag_posts(self, hashtag: str) -> Iterator[Post]:
    """Get Posts associated with a #hashtag.

    :param hashtag: Hashtag name, used as ``tag_name`` in the query and in the referer URL.
    :return: Iterator over the posts found for the hashtag.
    """
    referer = 'https://www.instagram.com/explore/tags/{0}/'.format(hashtag)

    def hashtag_media(response):
        return response['data']['hashtag']['edge_hashtag_to_media']

    for node in self.graphql_node_list(17875800862117404, {'tag_name': hashtag}, referer, hashtag_media):
        yield Post(self, node)
|
2017-06-24 22:43:40 +02:00
|
|
|
|
|
|
|
def download_hashtag(self, hashtag: str,
                     max_count: Optional[int] = None,
                     filter_func: Optional[Callable[[Post], bool]] = None,
                     fast_update: bool = False) -> None:
    """Download the pictures found for one hashtag.

    To download the last 30 pictures with hashtag #cat, do::

        loader = Instaloader()
        loader.download_hashtag('cat', max_count=30)

    Posts rejected by ``filter_func`` are logged as skipped and do not count towards ``max_count``.

    :param hashtag: Hashtag to download, without leading '#'
    :param max_count: Maximum count of pictures to download
    :param filter_func: function(post), which returns True if given picture should be downloaded
    :param fast_update: If true, abort when first already-downloaded picture is encountered
    """
    hashtag = hashtag.lower()
    target = '#' + hashtag
    error_info = 'Download hashtag #{}'.format(hashtag)
    accepted = 0  # number of posts that passed the filter so far
    for post in self.get_hashtag_posts(hashtag):
        if max_count is not None and accepted >= max_count:
            break
        self._log('[{0:3d}] #{1} '.format(accepted + 1, hashtag), end='', flush=True)
        if filter_func is not None and not filter_func(post):
            self._log('<skipped>')
            continue
        accepted += 1
        with self._error_catcher(error_info):
            newly_downloaded = self.download_post(post, target=target)
            if fast_update and not newly_downloaded:
                break
|
2017-04-17 12:10:43 +02:00
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def check_profile_id(self, profile: str, profile_metadata: Optional[Dict[str, Any]] = None) -> Tuple[str, int]:
    """
    Consult locally stored ID of profile with given name, check whether ID matches and whether name
    has changed and return current name of the profile, and store ID of profile.

    If an ID file exists and either no metadata was given or the stored ID differs from the ID in the
    metadata, the current name is looked up with :meth:`get_username_by_id` and the local directory
    (or ID file, depending on ``dirname_pattern``) is renamed accordingly. If no ID file exists and
    metadata was given, the ID is written to a new ID file.

    :param profile: Profile name
    :param profile_metadata:
       The profile's metadata (:meth:`get_profile_metadata`), or None if the profile was not found
    :return: current profile name, profile id (always an ``int``)
    :raises ProfileNotExistsException: if there is neither a stored ID nor metadata for the profile
    """
    profile_exists = profile_metadata is not None
    lowered = profile.lower()
    # The ID lives in '<dir>/id' when the directory name depends on the profile, else in '<dir>/<profile>_id'.
    per_profile_dir = (format_string_contains_key(self.dirname_pattern, 'profile') or
                       format_string_contains_key(self.dirname_pattern, 'target'))
    if per_profile_dir:
        id_filename = '{0}/id'.format(self.dirname_pattern.format(profile=lowered, target=lowered))
    else:
        id_filename = '{0}/{1}_id'.format(self.dirname_pattern.format(), lowered)
    try:
        with open(id_filename, 'rb') as id_file:
            profile_id = int(id_file.read())
        if (not profile_exists) or \
                (profile_id != int(profile_metadata['user']['id'])):
            if profile_exists:
                self._log("Profile {0} does not match the stored unique ID {1}.".format(profile, profile_id))
            else:
                self._log("Trying to find profile {0} using its unique ID {1}.".format(profile, profile_id))
            newname = self.get_username_by_id(profile_id)
            self._log("Profile {0} has changed its name to {1}.".format(profile, newname))
            if per_profile_dir:
                os.rename(self.dirname_pattern.format(profile=lowered, target=lowered),
                          self.dirname_pattern.format(profile=newname.lower(),
                                                      target=newname.lower()))
            else:
                os.rename('{0}/{1}_id'.format(self.dirname_pattern.format(), lowered),
                          '{0}/{1}_id'.format(self.dirname_pattern.format(), newname.lower()))
            return newname, profile_id
        return profile, profile_id
    except FileNotFoundError:
        # No ID stored yet (any FileNotFoundError raised in the block above ends up here as well).
        pass
    if profile_exists:
        os.makedirs(self.dirname_pattern.format(profile=lowered, target=lowered), exist_ok=True)
        # Convert to int so that this path returns the same type as the paths above and as annotated.
        profile_id = int(profile_metadata['user']['id'])
        with open(id_filename, 'w') as text_file:
            text_file.write("{0}\n".format(profile_id))
            self._log("Stored ID {0} for profile {1}.".format(profile_id, profile))
        return profile, profile_id
    raise ProfileNotExistsException("Profile {0} does not exist.".format(profile))
|
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def get_profile_metadata(self, profile_name: str) -> Dict[str, Any]:
    """Fetch a profile's metadata, for use with e.g. :meth:`get_profile_posts` and :meth:`check_profile_id`.

    :param profile_name: Name of the profile to query
    :return: Whatever :meth:`get_json` returns for ``'<profile_name>/'`` with parameter ``__a=1``
    :raises ProfileNotExistsException: if the query raises QueryReturnedNotFoundException
    """
    profile_path = '{}/'.format(profile_name)
    try:
        metadata = self.get_json(profile_path, params={'__a': 1})
    except QueryReturnedNotFoundException:
        raise ProfileNotExistsException('Profile {} does not exist.'.format(profile_name))
    else:
        return metadata
|
|
|
|
|
2017-08-19 12:58:28 +02:00
|
|
|
def get_profile_posts(self, profile_metadata: Dict[str, Any]) -> Iterator[Post]:
    """Iterate over all posts of a profile.

    The nodes already contained in ``profile_metadata`` are yielded first; further pages are
    requested with :meth:`graphql_query` for as long as the page info reports a next page.

    :param profile_metadata: The profile's metadata, as returned by :meth:`get_profile_metadata`
    :return: Iterator yielding a :class:`Post` per media node
    """
    user = profile_metadata['user']
    profile_name = user['username']
    profile_id = int(user['id'])

    for node in user['media']['nodes']:
        yield Post(self, node, profile=profile_name, profile_id=profile_id)

    page_info = user['media']['page_info']
    has_next_page, end_cursor = page_info['has_next_page'], page_info['end_cursor']
    while has_next_page:
        # We do not use self.graphql_node_list() here, because profile_metadata
        # lets us obtain the first 12 nodes 'for free'
        query_variables = {'id': user['id'],
                           'first': 200,
                           'after': end_cursor}
        data = self.graphql_query(17888483320059182, query_variables,
                                  'https://www.instagram.com/{0}/'.format(profile_name))
        media = data['data']['user']['edge_owner_to_timeline_media']
        for edge in media['edges']:
            yield Post(self, edge['node'], profile=profile_name, profile_id=profile_id)
        page_info = media['page_info']
        has_next_page, end_cursor = page_info['has_next_page'], page_info['end_cursor']
|
|
|
|
|
|
|
|
def download_profile(self, name: str,
                     profile_pic: bool = True, profile_pic_only: bool = False,
                     fast_update: bool = False,
                     download_stories: bool = False, download_stories_only: bool = False,
                     filter_func: Optional[Callable[[Post], bool]] = None) -> None:
    """Download one profile.

    :param name: Profile name; it is lowercased before use
    :param profile_pic: Download the profile picture
    :param profile_pic_only: Download the profile picture and return without downloading anything else
    :param fast_update: Stop at the first post for which :meth:`download_post` returns a false value
    :param download_stories: Also download the profile's stories
    :param download_stories_only: Download the stories and return without downloading posts
    :param filter_func: function(post), which returns True if given post should be downloaded
    :raises LoginRequiredException: if the profile is private and we are not logged in
    :raises PrivateProfileNotFollowedException: if the profile is private, not followed and not our own
    :raises ProfileHasNoPicsException: if the metadata contains no media nodes
    """
    name = name.lower()

    # Fetch the profile's main page JSON. A missing profile is tolerated at this point, because
    # check_profile_id() raises ProfileNotExistsException itself when it cannot resolve the profile.
    profile_metadata = None
    with suppress(ProfileNotExistsException):
        profile_metadata = self.get_profile_metadata(name)

    # Find out whether the profile exists or has been renamed since the last download;
    # in case of a rename continue with the new name and its metadata.
    current_name, profile_id = self.check_profile_id(name, profile_metadata)
    if current_name != name:
        name = current_name
        profile_metadata = self.get_profile_metadata(name)

    user = profile_metadata["user"]
    wants_stories = download_stories or download_stories_only

    # Profile picture
    if profile_pic or profile_pic_only:
        with self._error_catcher('Download profile picture of {}'.format(name)):
            self.download_profilepic(name, user["profile_pic_url"])
    if profile_pic_only:
        return

    # Access checks
    if user["is_private"]:
        if not self.is_logged_in:
            raise LoginRequiredException("profile %s requires login" % name)
        if not user["followed_by_viewer"] and self.username != user["username"]:
            raise PrivateProfileNotFollowedException("Profile %s: private but not followed." % name)
    elif self.is_logged_in and not wants_stories:
        self._log("profile %s could also be downloaded anonymously." % name)

    # Stories, if requested
    if wants_stories:
        with self._error_catcher("Download stories of {}".format(name)):
            self.download_stories(userids=[profile_id], filename_target=name, fast_update=fast_update)
    if download_stories_only:
        return

    media = user["media"]
    if "nodes" not in media or not media["nodes"]:
        raise ProfileHasNoPicsException("Profile %s: no pics found." % name)

    # Walk through the posts and download them
    self._log("Retrieving posts from profile {}.".format(name))
    totalcount = media["count"]
    error_info = 'Download profile {}'.format(name)
    for index, post in enumerate(self.get_profile_posts(profile_metadata), start=1):
        self._log("[%3i/%3i] " % (index, totalcount), end="", flush=True)
        if filter_func is not None and not filter_func(post):
            self._log('<skipped>')
            continue
        with self._error_catcher(error_info):
            newly_downloaded = self.download_post(post, target=name)
            if fast_update and not newly_downloaded:
                break
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2017-07-14 11:00:22 +02:00
|
|
|
def interactive_login(self, username: str) -> None:
    """Log in and internally store the session, prompting the user for the password.

    The prompt is repeated until :meth:`login` no longer raises BadCredentialsException;
    each such error is printed to stderr.

    :param username: Name of the account to log in with
    :raises LoginRequiredException: when in quiet mode."""
    if self.quiet:
        raise LoginRequiredException("Quiet mode requires given password or valid session file.")
    prompt = "Enter Instagram password for %s: " % username
    while True:
        password = getpass.getpass(prompt=prompt)
        try:
            self.login(username, password)
        except BadCredentialsException as err:
            print(err, file=sys.stderr)
        else:
            return
|
|
|
|
|
2017-08-06 19:27:46 +02:00
|
|
|
def main(self, profilelist: List[str], username: Optional[str] = None, password: Optional[str] = None,
         sessionfile: Optional[str] = None, max_count: Optional[int] = None,
         profile_pic: bool = True, profile_pic_only: bool = False,
         fast_update: bool = False,
         stories: bool = False, stories_only: bool = False,
         filter_str: Optional[str] = None) -> None:
    """Download a set of profiles, hashtags etc. and take care of logging in and session files if desired.

    :param profilelist: Targets: profile names, ``#hashtag``, ``@profile`` (its followees), ``:feed``, ``:stories``
    :param username: Login name; if None, no login is performed and login-only targets are reported as errors
    :param password: Password for the login; if None, :meth:`interactive_login` is used when a login is needed
    :param sessionfile: Session file to load from and to save to
    :param max_count: Passed on to :meth:`download_hashtag` and :meth:`download_feed_posts`
    :param profile_pic: Passed on to :meth:`download_profile`
    :param profile_pic_only: Passed on to :meth:`download_profile`
    :param fast_update: Passed on to the download methods
    :param stories: Passed on to :meth:`download_profile`
    :param stories_only: Passed on to :meth:`download_profile`
    :param filter_str: Filter expression, turned into a filter function with filterstr_to_filterfunc()
    """
    logged_in_run = username is not None
    login_required_msg = "--login=USERNAME required to download {}."

    # Build the filter function from the given expression
    filter_func = filterstr_to_filterfunc(filter_str, logged_in_run) if filter_str is not None else None
    if filter_str is not None:
        self._log('Only download posts with property "{}".'.format(filter_str))

    # Log in, if desired
    if logged_in_run:
        try:
            self.load_session_from_file(username, sessionfile)
        except FileNotFoundError as err:
            if sessionfile is not None:
                print(err, file=sys.stderr)
            self._log("Session file does not exist yet - Logging in.")
        if not self.is_logged_in or username != self.test_login(self.session):
            if password is None:
                self.interactive_login(username)
            else:
                self.login(username, password)
        self._log("Logged in as %s." % username)

    # The try block exists for KeyboardInterrupt, so that the session is still saved on ^C
    targets = set()
    try:
        # Handle special targets right away and collect plain profile names
        for pentry in profilelist:
            if pentry[0] == '#':
                self._log("Retrieving pictures with hashtag {0}".format(pentry))
                with self._error_catcher():
                    self.download_hashtag(hashtag=pentry[1:], max_count=max_count, fast_update=fast_update,
                                          filter_func=filter_func)
            elif pentry[0] == '@':
                if not logged_in_run:
                    self.error(login_required_msg.format(pentry))
                    continue
                self._log("Retrieving followees of %s..." % pentry[1:])
                with self._error_catcher():
                    followees = self.get_followees(pentry[1:])
                    followee_names = [followee['username'] for followee in followees]
                    targets.update(followee_names)
            elif pentry == ":feed":
                if not logged_in_run:
                    self.error(login_required_msg.format(pentry))
                    continue
                self._log("Retrieving pictures from your feed...")
                with self._error_catcher():
                    self.download_feed_posts(fast_update=fast_update, max_count=max_count,
                                             filter_func=filter_func)
            elif pentry == ":stories":
                if not logged_in_run:
                    self.error(login_required_msg.format(pentry))
                    continue
                with self._error_catcher():
                    self.download_stories(fast_update=fast_update)
            else:
                targets.add(pentry)

        if len(targets) > 1:
            self._log("Downloading {} profiles: {}".format(len(targets), ','.join(targets)))

        # Download every collected profile
        for target in targets:
            with self._error_catcher():
                try:
                    self.download_profile(target, profile_pic, profile_pic_only, fast_update, stories, stories_only,
                                          filter_func=filter_func)
                except ProfileNotExistsException as err:
                    if not logged_in_run:
                        raise err
                    self._log(err)
                    self._log("Trying again anonymously, helps in case you are just blocked.")
                    with self.anonymous_copy() as anonymous_loader:
                        with self._error_catcher():
                            anonymous_loader.download_profile(target, profile_pic, profile_pic_only,
                                                              fast_update, filter_func=filter_func)
    except KeyboardInterrupt:
        print("\nInterrupted by user.", file=sys.stderr)

    # Save the session if it is useful
    if logged_in_run:
        self.save_session_to_file(sessionfile)

    if self.error_log:
        print("\nErrors occured:", file=sys.stderr)
        for err in self.error_log:
            print(err, file=sys.stderr)
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2016-06-15 12:42:08 +02:00
|
|
|
|
2016-06-27 16:49:00 +02:00
|
|
|
def main():
    """Command-line entry point.

    Builds the argument parser, parses the command line, derives the Tristate settings from the
    given flags, creates an :class:`Instaloader` and runs :meth:`Instaloader.main` with the parsed
    options.

    :raises SystemExit: on contradictory or unsupported arguments, and with a "Fatal error" message
       when an InstaloaderException is raised while running
    """
    parser = ArgumentParser(description=__doc__, add_help=False,
                            epilog="Report issues at https://github.com/Thammus/instaloader/issues. "
                                   "The complete documentation can be found at "
                                   "https://instaloader.readthedocs.io/.")

    g_what = parser.add_argument_group('What to Download',
                                       'Specify a list of profiles or #hashtags. For each of these, Instaloader '
                                       'creates a folder and '
                                       'downloads all posts along with the pictures\'s '
                                       'captions and the current profile picture. '
                                       'If an already-downloaded profile has been renamed, Instaloader automatically '
                                       'finds it by its unique ID and renames the folder likewise.')
    g_what.add_argument('profile', nargs='*', metavar='profile|#hashtag',
                        help='Name of profile or #hashtag to download. '
                             'Alternatively, if --login is given: @<profile> to download all followees of '
                             '<profile>; the special targets :feed to '
                             'download pictures from your feed; or :stories to download the stories of your '
                             'followees.')
    g_what.add_argument('-P', '--profile-pic-only', action='store_true',
                        help='Only download profile picture.')
    g_what.add_argument('--no-profile-pic', action='store_true',
                        help='Do not download profile picture.')
    g_what.add_argument('-V', '--no-videos', action='store_true',
                        help='Do not download videos.')
    g_what.add_argument('--no-video-thumbnails', action='store_true',
                        help='Do not download thumbnails of videos.')
    g_what.add_argument('-G', '--geotags', action='store_true',
                        help='Download geotags when available. Geotags are stored as a '
                             'text file with the location\'s name and a Google Maps link. '
                             'This requires an additional request to the Instagram '
                             'server for each picture, which is why it is disabled by default.')
    g_what.add_argument('--no-geotags', action='store_true',
                        help='Do not store geotags, even if they can be obtained without any additional request.')
    g_what.add_argument('-C', '--comments', action='store_true',
                        help='Download and update comments for each post. '
                             'This requires an additional request to the Instagram '
                             'server for each post, which is why it is disabled by default.')
    g_what.add_argument('--no-captions', action='store_true',
                        help='Do not store media captions, although no additional request is needed to obtain them.')
    g_what.add_argument('--metadata-json', action='store_true',
                        help='Create a JSON file containing the metadata of each post. This does not include comments '
                             'nor geotags.')
    g_what.add_argument('-s', '--stories', action='store_true',
                        help='Also download stories of each profile that is downloaded. Requires --login.')
    g_what.add_argument('--stories-only', action='store_true',
                        help='Rather than downloading regular posts of each specified profile, only download '
                             'stories. Requires --login. Does not imply --no-profile-pic.')
    g_what.add_argument('--only-if', metavar='filter',
                        help='Expression that, if given, must evaluate to True for each post to be downloaded. Must be '
                             'a syntactically valid python expression. Variables are evaluated to '
                             'instaloader.Post attributes. Example: --only-if=viewer_has_liked.')

    g_stop = parser.add_argument_group('When to Stop Downloading',
                                       'If none of these options are given, Instaloader goes through all pictures '
                                       'matching the specified targets.')
    g_stop.add_argument('-F', '--fast-update', action='store_true',
                        help='For each target, stop when encountering the first already-downloaded picture. This '
                             'flag is recommended when you use Instaloader to update your personal Instagram archive.')
    # type=int lets argparse reject a non-numeric COUNT with a usage error instead of a later ValueError.
    g_stop.add_argument('-c', '--count', type=int,
                        help='Do not attempt to download more than COUNT posts. '
                             'Applies only to #hashtag and :feed.')

    g_login = parser.add_argument_group('Login (Download Private Profiles)',
                                        'Instaloader can login to Instagram. This allows downloading private profiles. '
                                        'To login, pass the --login option. Your session cookie (not your password!) '
                                        'will be saved to a local file to be reused next time you want Instaloader '
                                        'to login.')
    g_login.add_argument('-l', '--login', metavar='YOUR-USERNAME',
                         help='Login name (profile name) for your Instagram account.')
    g_login.add_argument('-f', '--sessionfile',
                         help='Path for loading and storing session key file. '
                              'Defaults to ' + get_default_session_filename("<login_name>"))
    g_login.add_argument('-p', '--password', metavar='YOUR-PASSWORD',
                         help='Password for your Instagram account. Without this option, '
                              'you\'ll be prompted for your password interactively if '
                              'there is not yet a valid session file.')

    g_how = parser.add_argument_group('How to Download')
    g_how.add_argument('--dirname-pattern',
                       help='Name of directory where to store posts. {profile} is replaced by the profile name, '
                            '{target} is replaced by the target you specified, i.e. either :feed, #hashtag or the '
                            'profile name. Defaults to \'{target}\'.')
    g_how.add_argument('--filename-pattern',
                       help='Prefix of filenames. Posts are stored in the directory whose pattern is given with '
                            '--dirname-pattern. {profile} is replaced by the profile name, '
                            '{target} is replaced by the target you specified, i.e. either :feed, #hashtag or the '
                            'profile name. Also, the fields {date} and {shortcode} can be specified. In case of not '
                            'downloading stories, the attributes of the Post class can be used in addition, e.g. '
                            '{post.owner_id} or {post.mediaid}. Defaults to \'{date:%%Y-%%m-%%d_%%H-%%M-%%S}\'.')
    g_how.add_argument('--user-agent',
                       help='User Agent to use for HTTP requests. Defaults to \'{}\'.'.format(default_user_agent()))
    g_how.add_argument('-S', '--no-sleep', action='store_true', help=SUPPRESS)
    g_how.add_argument('--max-connection-attempts', metavar='N', type=int, default=3,
                       help='Maximum number of connection attempts until a request is aborted. Defaults to 3. If a '
                            'connection fails, it can be manually skipped by hitting CTRL+C. Set this to 0 to retry '
                            'infinitely.')

    g_misc = parser.add_argument_group('Miscellaneous Options')
    g_misc.add_argument('-q', '--quiet', action='store_true',
                        help='Disable user interaction, i.e. do not print messages (except errors) and fail '
                             'if login credentials are needed but not given. This makes Instaloader suitable as a '
                             'cron job.')
    g_misc.add_argument('-h', '--help', action='help', help='Show this help message and exit.')
    g_misc.add_argument('--version', action='version', help='Show version number and exit.',
                        version=__version__)

    args = parser.parse_args()
    try:
        # Stories need a login: without one, --stories is dropped with a warning and --stories-only aborts.
        if args.login is None and (args.stories or args.stories_only):
            print("--login=USERNAME required to download stories.", file=sys.stderr)
            args.stories = False
            if args.stories_only:
                raise SystemExit(1)

        if ':feed-all' in args.profile or ':feed-liked' in args.profile:
            raise SystemExit(":feed-all and :feed-liked were removed. Use :feed as target and "
                             "eventually --only-if=viewer_has_liked.")

        download_videos = Tristate.always if not args.no_videos else Tristate.no_extra_query
        download_video_thumbnails = Tristate.always if not args.no_video_thumbnails else Tristate.never
        download_comments = Tristate.always if args.comments else Tristate.no_extra_query
        save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
        save_metadata = Tristate.always if args.metadata_json else Tristate.never

        if args.geotags and args.no_geotags:
            raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.")
        elif args.geotags:
            download_geotags = Tristate.always
        elif args.no_geotags:
            download_geotags = Tristate.never
        else:
            download_geotags = Tristate.no_extra_query

        loader = Instaloader(sleep=not args.no_sleep, quiet=args.quiet,
                             user_agent=args.user_agent,
                             dirname_pattern=args.dirname_pattern, filename_pattern=args.filename_pattern,
                             download_videos=download_videos, download_video_thumbnails=download_video_thumbnails,
                             download_geotags=download_geotags,
                             save_captions=save_captions, download_comments=download_comments,
                             save_metadata=save_metadata, max_connection_attempts=args.max_connection_attempts)
        loader.main(args.profile,
                    username=args.login.lower() if args.login is not None else None,
                    password=args.password,
                    sessionfile=args.sessionfile,
                    max_count=args.count,  # already an int (or None) thanks to type=int
                    profile_pic=not args.no_profile_pic,
                    profile_pic_only=args.profile_pic_only,
                    fast_update=args.fast_update,
                    stories=args.stories,
                    stories_only=args.stories_only,
                    filter_str=args.only_if)
    except InstaloaderException as err:
        raise SystemExit("Fatal error: %s" % err)
|
2016-06-27 16:49:00 +02:00
|
|
|
|
2017-06-24 22:43:40 +02:00
|
|
|
|
2016-06-27 16:49:00 +02:00
|
|
|
# Run the command-line interface only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|