1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-10-05 14:57:08 +02:00

Saving and reimporting of JSON files

Metadata JSON files are now created by default. They can later be given
as a target to redownload Posts, StoryItems or Profiles with new
settings.
This commit is contained in:
Alexander Graf 2018-04-12 22:01:26 +02:00
parent 5d249c5401
commit f0bebd0d96
5 changed files with 127 additions and 41 deletions

View File

@ -67,12 +67,13 @@ automatically **finds it by its unique ID** and renames the folder likewise.
Also **download stories** of each profile that is downloaded. Requires
:option:`--login`.
.. option:: --metadata-json
.. option:: --no-metadata-json
Create a JSON file containing the metadata of each post. This does not
include comments (see :option:`--comments`) nor geotags (see
:option:`--geotags`). The JSON files contain the properties of
:class:`instaloader.Post`.
Do not create a JSON file containing the metadata of each post.
.. option:: --no-compress-json
Do not xz compress JSON files, rather create pretty formatted JSONs.
.. option:: --stories-only

View File

@ -14,4 +14,5 @@ else:
from .exceptions import *
from .instaloader import Instaloader, Tristate
from .structures import Post, Profile, Story, StoryItem, shortcode_to_mediaid, mediaid_to_shortcode
from .structures import (Post, Profile, Story, StoryItem, load_structure_from_file, mediaid_to_shortcode,
save_structure_to_file, shortcode_to_mediaid)

View File

@ -7,7 +7,7 @@ from argparse import ArgumentParser, SUPPRESS
from typing import Callable, List, Optional
from . import (Instaloader, InstaloaderException, InvalidArgumentException, Post, Profile, ProfileNotExistsException,
Tristate, __version__)
StoryItem, Tristate, __version__, load_structure_from_file)
from .instaloader import get_default_session_filename
from .instaloadercontext import default_user_agent
@ -86,6 +86,22 @@ def _main(instaloader: Instaloader, targetlist: List[str],
try:
# Generate set of profiles, already downloading non-profile targets
for target in targetlist:
if (target.endswith('.json') or target.endswith('.json.xz')) and os.path.isfile(target):
with instaloader.context.error_catcher(target):
structure = load_structure_from_file(instaloader.context, target)
if isinstance(structure, Post):
instaloader.context.log("Downloading {} ({})".format(structure, target))
instaloader.download_post(structure, os.path.dirname(target))
elif isinstance(structure, StoryItem):
instaloader.context.log("Attempting to download {} ({})".format(structure, target))
instaloader.download_story(structure, os.path.dirname(target))
elif isinstance(structure, Profile):
instaloader.context.log("Going to download {} ({})".format(structure.username, target))
profiles.add(structure.username)
else:
raise InvalidArgumentException("{} JSON file not supported as target"
.format(structure.__class__.__name__))
continue
# strip '/' characters to be more shell-autocompletion-friendly
target = target.rstrip('/')
with instaloader.context.error_catcher(target):
@ -181,9 +197,12 @@ def main():
'server for each post, which is why it is disabled by default.')
g_what.add_argument('--no-captions', action='store_true',
help='Do not store media captions, although no additional request is needed to obtain them.')
g_what.add_argument('--no-metadata-json', action='store_true',
help='Do not create a JSON file containing the metadata of each post.')
g_what.add_argument('--metadata-json', action='store_true',
help='Create a JSON file containing the metadata of each post. This does not include comments '
'nor geotags.')
help=SUPPRESS)
g_what.add_argument('--no-compress-json', action='store_true',
help='Do not xz compress JSON files, rather create pretty formatted JSONs.')
g_what.add_argument('-s', '--stories', action='store_true',
help='Also download stories of each profile that is downloaded. Requires --login.')
g_what.add_argument('--stories-only', action='store_true',
@ -264,7 +283,7 @@ def main():
download_video_thumbnails = Tristate.always if not args.no_video_thumbnails else Tristate.never
download_comments = Tristate.always if args.comments else Tristate.no_extra_query
save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
save_metadata = Tristate.always if args.metadata_json else Tristate.never
save_metadata = Tristate.always if not args.no_metadata_json else Tristate.never
if args.geotags and args.no_geotags:
raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.")
@ -281,7 +300,8 @@ def main():
download_videos=download_videos, download_video_thumbnails=download_video_thumbnails,
download_geotags=download_geotags,
save_captions=save_captions, download_comments=download_comments,
save_metadata=save_metadata, max_connection_attempts=args.max_connection_attempts)
save_metadata=save_metadata, compress_json=not args.no_compress_json,
max_connection_attempts=args.max_connection_attempts)
_main(loader,
args.profile,
username=args.login.lower() if args.login is not None else None,

View File

@ -16,7 +16,7 @@ from typing import Any, Callable, Dict, Iterator, List, Optional
from .exceptions import *
from .instaloadercontext import InstaloaderContext
from .structures import Post, Profile, Story, StoryItem
from .structures import JsonExportable, Post, Profile, Story, StoryItem, save_structure_to_file
def get_default_session_filename(username: str) -> str:
@ -83,7 +83,8 @@ class Instaloader:
download_geotags: Tristate = Tristate.no_extra_query,
save_captions: Tristate = Tristate.no_extra_query,
download_comments: Tristate = Tristate.no_extra_query,
save_metadata: Tristate = Tristate.never,
save_metadata: Tristate = Tristate.no_extra_query,
compress_json: bool = True,
max_connection_attempts: int = 3):
self.context = InstaloaderContext(sleep, quiet, user_agent, max_connection_attempts)
@ -108,6 +109,7 @@ class Instaloader:
self.save_captions = save_captions
self.download_comments = download_comments
self.save_metadata = save_metadata
self.compress_json = compress_json
@contextmanager
def anonymous_copy(self):
@ -118,7 +120,7 @@ class Instaloader:
self.download_video_thumbnails,
self.download_geotags,
self.save_captions, self.download_comments,
self.save_metadata, self.context.max_connection_attempts)
self.save_metadata, self.compress_json, self.context.max_connection_attempts)
new_loader.context.previous_queries = self.context.previous_queries
yield new_loader
self.context.error_log.extend(new_loader.context.error_log)
@ -158,12 +160,16 @@ class Instaloader:
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
return True
def save_metadata_json(self, filename: str, post: Post) -> None:
    """Write the metadata of a :class:`Post` to ``<filename>.json``.

    The post is serialized through :meth:`Post.json_encoder` with an indentation of four spaces.
    Afterwards ``json`` is logged to the context.
    """
    json_path = filename + '.json'
    with open(json_path, 'w') as json_file:
        json.dump(post, default=Post.json_encoder, indent=4, fp=json_file)
    self.context.log('json', end=' ', flush=True)
def save_metadata_json(self, filename: str, structure: JsonExportable) -> None:
    """Store the metadata of ``structure`` in a JSON file.

    The target name is ``filename`` followed by ``.json.xz`` when ``self.compress_json`` is set and
    by ``.json`` otherwise. Writing is delegated to :func:`save_structure_to_file`.
    """
    suffix = '.json.xz' if self.compress_json else '.json'
    save_structure_to_file(structure, filename + suffix)
    # Only Posts and StoryItems get the 'json ' progress message.
    if not isinstance(structure, (Post, StoryItem)):
        return
    self.context.log('json', end=' ', flush=True)
def update_comments(self, filename: str, post: Post, filename_alt: Optional[str] = None) -> None:
try:
@ -393,9 +399,8 @@ class Instaloader:
if self.download_comments is Tristate.always:
self.update_comments(filename=filename, filename_alt=filename_old, post=post)
# Save metadata as JSON if desired. It might require an extra query, depending on which information has been
# already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query.
if self.save_metadata is Tristate.always:
# Save metadata as JSON if desired.
if self.save_metadata is not Tristate.never:
self.save_metadata_json(filename, post)
self.context.log()
@ -489,6 +494,9 @@ class Instaloader:
filename_alt=filename_old,
url=item.video_url,
mtime=date_local)
# Save metadata as JSON if desired.
if self.save_metadata is not Tristate.never:
self.save_metadata_json(filename, item)
self.context.log()
return downloaded
@ -698,6 +706,12 @@ class Instaloader:
profile_name = profile.username
# Save metadata as JSON if desired.
if self.save_metadata is not Tristate.never:
json_filename = '{0}/{1}_{2}'.format(self.dirname_pattern.format(profile=profile_name, target=profile_name),
profile_name, profile.userid)
self.save_metadata_json(json_filename, profile)
if self.context.is_logged_in and profile.has_blocked_viewer and not profile.is_private:
# raising ProfileNotExistsException invokes "trying again anonymously" logic
raise ProfileNotExistsException("Profile {} has blocked you".format(profile_name))

View File

@ -1,8 +1,11 @@
import json
import lzma
import re
from base64 import b64decode, b64encode
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional
from typing import Any, Dict, Iterator, List, Optional, Union
from . import __version__
from .exceptions import *
from .instaloadercontext import InstaloaderContext
@ -65,6 +68,15 @@ class Post:
"""Create a post object from a given mediaid"""
return cls.from_shortcode(context, mediaid_to_shortcode(mediaid))
def get_node(self):
    """Return the node dictionary that describes this post for JSON export.

    The full metadata dictionary is used when it is available, otherwise the node the post was
    created from. If an owner profile is attached, its node is embedded under the ``owner`` key.

    A shallow copy is returned, so embedding the owner does not modify the post's own
    dictionaries.
    """
    source = self._full_metadata_dict if self._full_metadata_dict else self._node
    # Copy first: assigning 'owner' below must not leak into the post's internal state.
    node = dict(source)
    if self._owner_profile:
        node['owner'] = self.owner_profile.get_node()
    return node
@property
def shortcode(self) -> str:
"""Media shortcode. URL of the post is instagram.com/p/<shortcode>/."""
@ -283,23 +295,6 @@ class Post:
params={'__a': 1})
return location_json["location"] if "location" in location_json else location_json['graphql']['location']
@staticmethod
def json_encoder(obj) -> Dict[str, Any]:
    """Turn a :class:`Post` into a dictionary that :mod:`json` can serialize.

    Every name listed by ``dir(Post)`` that starts neither with an underscore nor with an uppercase
    letter is read from ``obj``. Booleans, strings, numbers and lists are copied unchanged,
    datetimes are stored in ISO format, and values of any other type are left out.

    :raises TypeError: if ``obj`` is not a :class:`Post`.
    """
    if not isinstance(obj, Post):
        raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__))
    encoded = {}
    for name in dir(Post):
        initial = name[0]
        if initial == '_' or initial.isupper():
            # private names and uppercase names are not exported
            continue
        value = obj.__getattribute__(name)
        if isinstance(value, datetime):
            encoded[name] = value.isoformat()
        elif isinstance(value, (bool, str, int, float, list)):
            encoded[name] = value
    return encoded
class Profile:
"""
@ -342,6 +337,14 @@ class Profile:
username = Post.from_mediaid(context, int(data['edges'][0]["node"]["id"])).owner_username
return cls(context, {'username': username.lower(), 'id': profile_id})
def get_node(self):
    """Return a copy of the profile's node without its post collections.

    The keys ``edge_media_collections``, ``edge_owner_to_timeline_media`` and ``edge_saved_media``
    are dropped; the profile's own node is left untouched.
    """
    post_keys = ('edge_media_collections', 'edge_owner_to_timeline_media', 'edge_saved_media')
    return {key: value for key, value in self._node.items() if key not in post_keys}
def _obtain_metadata(self):
try:
if not self._rhx_gis:
@ -517,6 +520,12 @@ class StoryItem:
self._node = node
self._owner_profile = owner_profile
def get_node(self):
    """Return the node dictionary that describes this story item for JSON export.

    If an owner profile is attached, its node is embedded under the ``owner`` key.

    A shallow copy is returned, so embedding the owner does not modify the item's own node.
    """
    # Copy first: assigning 'owner' below must not leak into the item's internal state.
    node = dict(self._node)
    if self._owner_profile:
        node['owner'] = self._owner_profile.get_node()
    return node
@property
def mediaid(self) -> int:
"""The mediaid is a decimal representation of the media shortcode."""
@ -684,3 +693,44 @@ class Story:
def get_items(self) -> Iterator[StoryItem]:
"""Retrieve all items from a story."""
yield from (StoryItem(self._context, item, self.owner_profile) for item in self._node['items'])
JsonExportable = Union[Post, Profile, StoryItem]
def save_structure_to_file(structure: JsonExportable, filename: str):
    """Serialize ``structure`` to ``filename`` as JSON.

    The node returned by ``structure.get_node()`` is stored together with the Instaloader version
    and the class name of the structure. A filename ending in ``.xz`` yields a compact,
    LZMA-compressed file; any other filename yields an indented JSON file with sorted keys.
    """
    payload = {
        'node': structure.get_node(),
        'instaloader': {'version': __version__, 'node_type': structure.__class__.__name__},
    }
    if not filename.endswith('.xz'):
        with open(filename, 'wt') as plain_file:
            json.dump(payload, fp=plain_file, indent=4, sort_keys=True)
        return
    with lzma.open(filename, 'wt', check=lzma.CHECK_NONE) as xz_file:
        json.dump(payload, fp=xz_file, separators=(',', ':'))
def load_structure_from_file(context: InstaloaderContext, filename: str) -> JsonExportable:
    """Load a Post, Profile or StoryItem from a JSON file.

    :param context: Context that is handed to the created structure.
    :param filename: Path of the JSON file; names ending in ``.xz`` are read through :mod:`lzma`.
    :return: The structure named by the file's ``instaloader.node_type`` entry. Files without that
       entry but with a top-level ``shortcode`` are treated as Post JSONs created with
       Instaloader v3 and are loaded via :meth:`Post.from_shortcode`.
    :raises InvalidArgumentException: if the file has an unknown node type or matches neither
       layout.
    """
    opener = lzma.open if filename.endswith('.xz') else open
    # The context manager closes the file even when parsing the JSON fails.
    with opener(filename, 'rt') as fp:
        json_structure = json.load(fp)
    if 'node' in json_structure and 'instaloader' in json_structure and \
            'node_type' in json_structure['instaloader']:
        node_type = json_structure['instaloader']['node_type']
        if node_type == "Post":
            return Post(context, json_structure['node'])
        elif node_type == "Profile":
            return Profile(context, json_structure['node'])
        elif node_type == "StoryItem":
            return StoryItem(context, json_structure['node'])
        else:
            raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))
    elif 'shortcode' in json_structure:
        # Post JSON created with Instaloader v3
        return Post.from_shortcode(context, json_structure['shortcode'])
    else:
        raise InvalidArgumentException("{}: Not an Instaloader JSON.".format(filename))