From b2fa1495984165049369b1a378065951495f4450 Mon Sep 17 00:00:00 2001 From: Achim Date: Thu, 14 Nov 2024 16:50:06 +0100 Subject: [PATCH 1/3] fix imagechest extractor --- gallery_dl/extractor/imagechest.py | 71 +++++++++++++++++++++--------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 115fff32..3ee91ba4 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -11,6 +11,7 @@ from .common import GalleryExtractor, Extractor, Message from .. import text, exception +import json BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgchest\.com" @@ -36,32 +37,60 @@ class ImagechestGalleryExtractor(GalleryExtractor): self.images = self._images_api def metadata(self, page): - if "Sorry, but the page you requested could not be found." in page: + if "Not Found" in page: raise exception.NotFoundError("gallery") - return { - "gallery_id": self.gallery_id, - "title": text.unescape(text.extr( - page, 'property="og:title" content="', '"').strip()) + page_data = self._retrieve_page_data(page) + + metadata = { + "gallery_id": self.gallery_id } - def images(self, page): - if ' load-all">' in page: - url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) - headers = { - "X-Requested-With": "XMLHttpRequest", - "Origin" : self.root, - "Referer" : self.gallery_url, - } - csrf_token = text.extr(page, 'name="csrf-token" content="', '"') - data = {"_token": csrf_token} - page += self.request( - url, method="POST", headers=headers, data=data).text + for attribute in [ + "id", + "slug", + "status", + "title", + "nsfw", + "score", + "comments", + "upvotes", + "downvotes", + "favorites", + "views", + "created" + ]: + try: + metadata[attribute] = page_data["props"]["post"][attribute] + except Exception: + pass - return [ - (url, None) - for url in text.extract_iter(page, 'data-url="', '"') - ] + try: + metadata["tags"] = ",".join(page_data["props"]["post"]["tags"]) + except Exception: + pass + + return metadata + + def images(self, page): + try: + return [ + (file["link"], None) + for file in self._retrieve_page_data(page)["props"]["post"]["files"] + ] + except Exception: + return [] + + def _retrieve_page_data(self, page): + return json.loads( + text.unescape( + text.extr( + page, + begin='data-page="', + end='"', + default='{}') + ) + ) def _metadata_api(self, page): post = self.api.post(self.gallery_id) From 917e873c630e6b38abf19757836d7fe514278371 Mon Sep 17 00:00:00 2001 From: Achim Date: Thu, 14 Nov 2024 16:54:59 +0100 Subject: [PATCH 2/3] fix imagechest extractor --- gallery_dl/extractor/imagechest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index 3ee91ba4..abd14900 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -73,10 +73,12 @@ class ImagechestGalleryExtractor(GalleryExtractor): return metadata def images(self, page): + page_data = self._retrieve_page_data(page) + try: return [ (file["link"], None) - for file in self._retrieve_page_data(page)["props"]["post"]["files"] + for file in page_data["props"]["post"]["files"] ] except Exception: return [] From 75612997febf3616cb2221377c061ba877bae768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mike=20F=C3=A4hrmann?= Date: Sat, 16 Nov 2024 09:17:13 +0100 Subject: [PATCH 3/3] [imagechest] simplify and fix user pagination end condition --- gallery_dl/extractor/imagechest.py | 91 ++++++++---------------------- 1 file changed, 25 insertions(+), 66 deletions(-) diff --git a/gallery_dl/extractor/imagechest.py b/gallery_dl/extractor/imagechest.py index abd14900..159feba0 100644 --- a/gallery_dl/extractor/imagechest.py +++ b/gallery_dl/extractor/imagechest.py @@ -10,8 +10,7 @@ """Extractors for https://imgchest.com/""" from .common import GalleryExtractor, Extractor, Message -from .. import text, exception -import json +from .. import text, util, exception BASE_PATTERN = r"(?:https?://)?(?:www\.)?imgchest\.com" @@ -34,65 +33,23 @@ class ImagechestGalleryExtractor(GalleryExtractor): self.api = ImagechestAPI(self, access_token) self.gallery_url = None self.metadata = self._metadata_api - self.images = self._images_api def metadata(self, page): - if "Not Found" in page: - raise exception.NotFoundError("gallery") - - page_data = self._retrieve_page_data(page) - - metadata = { - "gallery_id": self.gallery_id - } - - for attribute in [ - "id", - "slug", - "status", - "title", - "nsfw", - "score", - "comments", - "upvotes", - "downvotes", - "favorites", - "views", - "created" - ]: - try: - metadata[attribute] = page_data["props"]["post"][attribute] - except Exception: - pass - try: - metadata["tags"] = ",".join(page_data["props"]["post"]["tags"]) + data = util.json_loads(text.unescape(text.extr( + page, 'data-page="', '"'))) + post = data["props"]["post"] except Exception: - pass + if "Not Found" in page: + raise exception.NotFoundError("gallery") + self.files = () + return {} - return metadata + self.files = post.pop("files", ()) + post["gallery_id"] = self.gallery_id + post["tags"] = [tag["name"] for tag in post["tags"]] - def images(self, page): - page_data = self._retrieve_page_data(page) - - try: - return [ - (file["link"], None) - for file in page_data["props"]["post"]["files"] - ] - except Exception: - return [] - - def _retrieve_page_data(self, page): - return json.loads( - text.unescape( - text.extr( - page, - begin='data-page="', - end='"', - default='{}') - ) - ) + return post def _metadata_api(self, page): post = self.api.post(self.gallery_id) @@ -105,15 +62,18 @@ class ImagechestGalleryExtractor(GalleryExtractor): post["gallery_id"] = self.gallery_id post.pop("image_count", None) - self._image_list = post.pop("images") + self.files = post.pop("images") return post - def _images_api(self, page): - return [ - (img["link"], img) - for img in self._image_list - ] + def images(self, page): + try: + return [ + (file["link"], file) + for file in self.files + ] + except Exception: + return () class ImagechestUserExtractor(Extractor): @@ -124,10 +84,6 @@ class ImagechestUserExtractor(Extractor): pattern = BASE_PATTERN + r"/u/([^/?#]+)" example = "https://imgchest.com/u/USER" - def __init__(self, match): - Extractor.__init__(self, match) - self.user = match.group(1) - def items(self): url = self.root + "/api/posts" params = { @@ -135,7 +91,7 @@ class ImagechestUserExtractor(Extractor): "sort" : "new", "tag" : "", "q" : "", - "username": text.unquote(self.user), + "username": text.unquote(self.groups[0]), "nsfw" : "true", } @@ -145,6 +101,9 @@ class ImagechestUserExtractor(Extractor): except (TypeError, KeyError): return + if not data: + return + for gallery in data: gallery["_extractor"] = ImagechestGalleryExtractor yield Message.Queue, gallery["link"], gallery