1
0
mirror of https://github.com/instaloader/instaloader.git synced 2024-10-05 14:57:08 +02:00

Save metadata JSON with --metadata-json

With --metadata-json, a JSON file for each post is created saving the
Post properties defined in instaloader.Post class, i.e. caption, number
of likes, people tagged in caption or the picture itself, etc.

This closes #33 and closes #47.
This commit is contained in:
Alexander Graf 2017-09-29 14:02:58 +02:00
parent e471bd5ad3
commit e0ed4cf16c
3 changed files with 57 additions and 15 deletions

View File

@ -70,10 +70,13 @@ Instaloader supports the following targets:
Instaloader goes through all media matching the specified targets and
downloads the pictures and videos and their captions. You can specify
- :option:`--comments`, to also **download comments** of each post and
- :option:`--comments`, to also **download comments** of each post,
- :option:`--geotags`, to **download geotags** of each post and save them as
Google Maps link.
Google Maps link,
- :option:`--metadata-json`, to store further post metadata in a separate JSON
file.
.. _filename-specification:

View File

@ -56,6 +56,13 @@ Instead of a *profile* or a *#hashtag*, the special targets
Also **download stories** of each profile that is downloaded. Requires
:option:`--login`.
.. option:: --metadata-json
Create a JSON file containing the metadata of each post. This does not
include comments (see :option:`--comments`) nor geotags (see
:option:`--geotags`). The JSON files contain the properties of
:class:`instaloader.Post`.
.. option:: --stories-only
Rather than downloading regular posts of each specified profile, only

View File

@ -170,8 +170,8 @@ class Post:
metadata, if required. This class unifies access to the properties associated with a post. It implements == and is
hashable.
The properties defined here are accessable by the filter expressions specified with the :option:`--only-if`
parameter.
The properties defined here are accessible by the filter expressions specified with the :option:`--only-if`
parameter and exported into JSON files with :option:`--metadata-json`.
"""
LOGIN_REQUIRING_PROPERTIES = ["viewer_has_liked"]
@ -361,6 +361,22 @@ class Post:
params={'__a': 1})
return location_json["location"]
@staticmethod
def json_encoder(obj) -> Dict[str, Any]:
"""Convert instance of :class:`Post` to a JSON-serializable dictionary."""
if not isinstance(obj, Post):
raise TypeError("Object of type {} is not a Post object.".format(obj.__class__.__name__))
jsondict = {}
for prop in dir(Post):
if prop[0].isupper() or prop[0] == '_':
# skip uppercase and private properties
continue
val = obj.__getattribute__(prop)
if val is True or val is False or isinstance(val, (str, int, float, list)):
jsondict[prop] = val
elif isinstance(val, datetime):
jsondict[prop] = val.isoformat()
return jsondict
class Tristate(Enum):
"""Tri-state to encode whether we should save certain information, i.e. videos, captions, comments or geotags.
@ -387,8 +403,9 @@ class Instaloader:
filename_pattern: Optional[str] = None,
download_videos: Tristate = Tristate.always,
download_geotags: Tristate = Tristate.no_extra_query,
download_captions: Tristate = Tristate.no_extra_query,
download_comments: Tristate = Tristate.no_extra_query):
save_captions: Tristate = Tristate.no_extra_query,
download_comments: Tristate = Tristate.no_extra_query,
save_metadata: Tristate = Tristate.never):
# configuration parameters
self.user_agent = user_agent if user_agent is not None else default_user_agent()
@ -401,16 +418,15 @@ class Instaloader:
if filename_pattern is not None else '{date:%Y-%m-%d_%H-%M-%S}'
self.download_videos = download_videos
self.download_geotags = download_geotags
self.download_captions = download_captions
self.save_captions = save_captions
self.download_comments = download_comments
self.previous_queries = dict()
self.save_metadata = save_metadata
# error log, filled with error() and printed at the end of Instaloader.main()
self.error_log = []
# For the adaption of sleep intervals (rate control)
self.request_count = 0
self.last_request_time = 0
self.previous_queries = dict()
@property
def is_logged_in(self) -> bool:
@ -423,7 +439,7 @@ class Instaloader:
new_loader = Instaloader(self.sleep, self.quiet, self.user_agent,
self.dirname_pattern, self.filename_pattern,
self.download_videos, self.download_geotags,
self.download_captions, self.download_comments)
self.save_captions, self.download_comments)
new_loader.previous_queries = self.previous_queries
yield new_loader
self.error_log.extend(new_loader.error_log)
@ -688,6 +704,12 @@ class Instaloader:
os.utime(filename, (datetime.now().timestamp(), mtime.timestamp()))
return True
def save_metadata_json(self, filename: str, post: Post) -> None:
"""Saves metadata JSON file of a :class:`Post`."""
filename += '.json'
json.dump(post, fp=open(filename, 'w'), indent=4, default=Post.json_encoder)
self._log('json', end=' ', flush=True)
def update_comments(self, filename: str, post: Post) -> None:
filename += '_comments.json'
try:
@ -885,7 +907,7 @@ class Instaloader:
downloaded = False
# Save caption if desired
if self.download_captions is not Tristate.never:
if self.save_captions is not Tristate.never:
if post.caption:
self.save_caption(filename, post.date, post.caption)
else:
@ -905,6 +927,11 @@ class Instaloader:
if self.download_comments is Tristate.always:
self.update_comments(filename, post)
# Save metadata as JSON if desired. It might require an extra query, depending on which information has been
# already obtained. Regarding Tristate interpretation, we always assume that it requires an extra query.
if self.save_metadata is Tristate.always:
self.save_metadata_json(filename, post)
self._log()
return downloaded
@ -989,7 +1016,7 @@ class Instaloader:
self._log("Warning: Unable to find story image.")
downloaded = False
if "caption" in item and item["caption"] is not None and \
self.download_captions is not Tristate.never:
self.save_captions is not Tristate.never:
caption = item["caption"]
if isinstance(caption, dict) and "text" in caption:
caption = caption["text"]
@ -1385,6 +1412,9 @@ def main():
'server for each post, which is why it is disabled by default.')
g_what.add_argument('--no-captions', action='store_true',
help='Do not store media captions, although no additional request is needed to obtain them.')
g_what.add_argument('--metadata-json', action='store_true',
help='Create a JSON file containing the metadata of each post. This does not include comments '
'nor geotags.')
g_what.add_argument('-s', '--stories', action='store_true',
help='Also download stories of each profile that is downloaded. Requires --login.')
g_what.add_argument('--stories-only', action='store_true',
@ -1458,7 +1488,8 @@ def main():
download_videos = Tristate.always if not args.no_videos else Tristate.no_extra_query
download_comments = Tristate.always if args.comments else Tristate.no_extra_query
download_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
save_captions = Tristate.no_extra_query if not args.no_captions else Tristate.never
save_metadata = Tristate.always if args.metadata_json else Tristate.never
if args.geotags and args.no_geotags:
raise SystemExit("--geotags and --no-geotags given. I am confused and refuse to work.")
@ -1473,7 +1504,8 @@ def main():
user_agent=args.user_agent,
dirname_pattern=args.dirname_pattern, filename_pattern=args.filename_pattern,
download_videos=download_videos, download_geotags=download_geotags,
download_captions=download_captions, download_comments=download_comments)
save_captions=save_captions, download_comments=download_comments,
save_metadata=save_metadata)
loader.main(args.profile, args.login.lower() if args.login is not None else None, args.password,
args.sessionfile,
int(args.count) if args.count is not None else None,