diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 6f9b317f..36ab9534 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -20,19 +20,36 @@ jobs: steps: - uses: actions/checkout@v3 + - name: Check file permissions + run: | + if [[ "$(find ./gallery_dl -type f -not -perm 644)" ]]; then exit 1; fi + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} - name: Install dependencies - env: - PYV: ${{ matrix.python-version }} run: | pip install -r requirements.txt pip install "flake8<4" "importlib-metadata<5" pip install youtube-dl - if [[ "$PYV" != "3.4" && "$PYV" != "3.5" ]]; then pip install yt-dlp; fi + + - name: Install yt-dlp + run: | + case "${{ matrix.python-version }}" in + 3.4|3.5) + # don't install yt-dlp + ;; + 3.6) + # install from PyPI + pip install yt-dlp + ;; + *) + # install from master + pip install https://github.com/yt-dlp/yt-dlp/archive/refs/heads/master.tar.gz + ;; + esac - name: Lint with flake8 run: | diff --git a/CHANGELOG.md b/CHANGELOG.md index a67e3abb..53034fa5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,177 @@ # Changelog +## 1.25.8 - 2023-07-15 +### Changes +- update default User-Agent header to Firefox 115 ESR +### Additions +- [gfycat] support `@me` user ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271)) +- [gfycat] implement login support ([#3770](https://github.com/mikf/gallery-dl/issues/3770), [#4271](https://github.com/mikf/gallery-dl/issues/4271)) +- [reddit] notify users about registering an OAuth application ([#4292](https://github.com/mikf/gallery-dl/issues/4292)) +- [twitter] add `ratelimit` option ([#4251](https://github.com/mikf/gallery-dl/issues/4251)) +- [twitter] use `TweetResultByRestId` endpoint that allows accessing single Tweets without login ([#4250](https://github.com/mikf/gallery-dl/issues/4250)) +### Fixes +- [bunkr] use `.la` TLD for `media-files12` servers ([#4147](https://github.com/mikf/gallery-dl/issues/4147), [#4276](https://github.com/mikf/gallery-dl/issues/4276)) +- [erome] ignore duplicate album IDs +- [fantia] send `X-Requested-With` header ([#4273](https://github.com/mikf/gallery-dl/issues/4273)) +- [gelbooru_v01] fix `source` metadata ([#4302](https://github.com/mikf/gallery-dl/issues/4302), [#4303](https://github.com/mikf/gallery-dl/issues/4303)) +- [gelbooru_v01] update `vidyart` domain +- [jpgfish] update domain to `jpeg.pet` +- [mangaread] fix `tags` metadata extraction +- [naverwebtoon] fix `comic` metadata extraction +- [newgrounds] extract & pass auth token during login ([#4268](https://github.com/mikf/gallery-dl/issues/4268)) +- [paheal] fix extraction ([#4262](https://github.com/mikf/gallery-dl/issues/4262), [#4293](https://github.com/mikf/gallery-dl/issues/4293)) +- [paheal] unescape `source` +- [philomena] fix `--range` ([#4288](https://github.com/mikf/gallery-dl/issues/4288)) +- [philomena] handle `429 Too Many Requests` errors ([#4288](https://github.com/mikf/gallery-dl/issues/4288)) +- [pornhub] set `accessAgeDisclaimerPH` cookie ([#4301](https://github.com/mikf/gallery-dl/issues/4301)) +- [reddit] use 0.6s delay between API requests ([#4292](https://github.com/mikf/gallery-dl/issues/4292)) +- [seiga] set `skip_fetish_warning` cookie ([#4242](https://github.com/mikf/gallery-dl/issues/4242)) +- [slideshare] fix extraction +- [twitter] fix `following` extractor not getting all users 
([#4287](https://github.com/mikf/gallery-dl/issues/4287)) +- [twitter] use GraphQL search endpoint by default ([#4264](https://github.com/mikf/gallery-dl/issues/4264)) +- [twitter] do not treat missing `TimelineAddEntries` instruction as fatal ([#4278](https://github.com/mikf/gallery-dl/issues/4278)) +- [weibo] fix cursor based pagination +- [wikifeet] fix `tag` extraction ([#4289](https://github.com/mikf/gallery-dl/issues/4289), [#4291](https://github.com/mikf/gallery-dl/issues/4291)) +### Removals +- [bcy] remove module +- [lineblog] remove module + +## 1.25.7 - 2023-07-02 +### Additions +- [flickr] add 'exif' option +- [flickr] add 'metadata' option ([#4227](https://github.com/mikf/gallery-dl/issues/4227)) +- [mangapark] add 'source' option ([#3969](https://github.com/mikf/gallery-dl/issues/3969)) +- [twitter] extend 'conversations' option ([#4211](https://github.com/mikf/gallery-dl/issues/4211)) +### Fixes +- [furaffinity] improve 'description' HTML ([#4224](https://github.com/mikf/gallery-dl/issues/4224)) +- [gelbooru_v01] fix '--range' ([#4167](https://github.com/mikf/gallery-dl/issues/4167)) +- [hentaifox] fix titles containing '@' ([#4201](https://github.com/mikf/gallery-dl/issues/4201)) +- [mangapark] update to v5 ([#3969](https://github.com/mikf/gallery-dl/issues/3969)) +- [piczel] update API server address ([#4244](https://github.com/mikf/gallery-dl/issues/4244)) +- [poipiku] improve error detection ([#4206](https://github.com/mikf/gallery-dl/issues/4206)) +- [sankaku] improve warnings for unavailable posts +- [senmanga] ensure download URLs have a scheme ([#4235](https://github.com/mikf/gallery-dl/issues/4235)) + +## 1.25.6 - 2023-06-17 +### Additions +- [blogger] download files from `lh*.googleusercontent.com` ([#4070](https://github.com/mikf/gallery-dl/issues/4070)) +- [fantia] extract `plan` metadata ([#2477](https://github.com/mikf/gallery-dl/issues/2477)) +- [fantia] emit warning for non-visible content sections ([#4128](https://github.com/mikf/gallery-dl/issues/4128)) +- [furaffinity] extract `favorite_id` metadata ([#4133](https://github.com/mikf/gallery-dl/issues/4133)) +- [jschan] add generic extractors for jschan image boards ([#3447](https://github.com/mikf/gallery-dl/issues/3447)) +- [kemonoparty] support `.su` TLDs ([#4139](https://github.com/mikf/gallery-dl/issues/4139)) +- [pixiv:novel] add `novel-bookmark` extractor ([#4111](https://github.com/mikf/gallery-dl/issues/4111)) +- [pixiv:novel] add `full-series` option ([#4111](https://github.com/mikf/gallery-dl/issues/4111)) +- [postimage] add gallery support, update image extractor ([#3115](https://github.com/mikf/gallery-dl/issues/3115), [#4134](https://github.com/mikf/gallery-dl/issues/4134)) +- [redgifs] support galleries ([#4021](https://github.com/mikf/gallery-dl/issues/4021)) +- [twitter] extract `conversation_id` metadata ([#3839](https://github.com/mikf/gallery-dl/issues/3839)) +- [vipergirls] add login support ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +- [vipergirls] use API endpoints ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +- [formatter] implement `H` conversion ([#4164](https://github.com/mikf/gallery-dl/issues/4164)) +### Fixes +- [acidimg] fix extraction ([#4136](https://github.com/mikf/gallery-dl/issues/4136)) +- [bunkr] update domain to bunkrr.su ([#4159](https://github.com/mikf/gallery-dl/issues/4159), [#4189](https://github.com/mikf/gallery-dl/issues/4189)) +- [bunkr] fix video downloads +- [fanbox] prevent exception due to missing embeds 
([#4088](https://github.com/mikf/gallery-dl/issues/4088)) +- [instagram] fix retrieving `/tagged` posts ([#4122](https://github.com/mikf/gallery-dl/issues/4122)) +- [jpgfish] update domain to `jpg.pet` ([#4138](https://github.com/mikf/gallery-dl/issues/4138)) +- [pixiv:novel] fix error with embeds extraction ([#4175](https://github.com/mikf/gallery-dl/issues/4175)) +- [pornhub] improve redirect handling ([#4188](https://github.com/mikf/gallery-dl/issues/4188)) +- [reddit] fix crash due to empty `crosspost_parent_lists` ([#4120](https://github.com/mikf/gallery-dl/issues/4120), [#4172](https://github.com/mikf/gallery-dl/issues/4172)) +- [redgifs] update `search` URL pattern ([#4115](https://github.com/mikf/gallery-dl/issues/4115), [#4185](https://github.com/mikf/gallery-dl/issues/4185)) +- [senmanga] fix and update ([#4160](https://github.com/mikf/gallery-dl/issues/4160)) +- [twitter] use GraphQL API search endpoint ([#3942](https://github.com/mikf/gallery-dl/issues/3942)) +- [wallhaven] improve HTTP error handling ([#4192](https://github.com/mikf/gallery-dl/issues/4192)) +- [weibo] prevent fatal exception due to missing video data ([#4150](https://github.com/mikf/gallery-dl/issues/4150)) +- [weibo] fix `.json` extension for some videos + +## 1.25.5 - 2023-05-27 +### Additions +- [8muses] add `parts` metadata field ([#3329](https://github.com/mikf/gallery-dl/issues/3329)) +- [danbooru] add `date` metadata field ([#4047](https://github.com/mikf/gallery-dl/issues/4047)) +- [e621] add `date` metadata field ([#4047](https://github.com/mikf/gallery-dl/issues/4047)) +- [gofile] add basic password support ([#4056](https://github.com/mikf/gallery-dl/issues/4056)) +- [imagechest] implement API support ([#4065](https://github.com/mikf/gallery-dl/issues/4065)) +- [instagram] add `order-files` option ([#3993](https://github.com/mikf/gallery-dl/issues/3993), [#4017](https://github.com/mikf/gallery-dl/issues/4017)) +- [instagram] add `order-posts` option ([#3993](https://github.com/mikf/gallery-dl/issues/3993), [#4017](https://github.com/mikf/gallery-dl/issues/4017)) +- [instagram] add `metadata` option ([#3107](https://github.com/mikf/gallery-dl/issues/3107)) +- [jpgfish] add `jpg.fishing` extractors ([#2657](https://github.com/mikf/gallery-dl/issues/2657), [#2719](https://github.com/mikf/gallery-dl/issues/2719)) +- [lensdump] add `lensdump.com` extractors ([#2078](https://github.com/mikf/gallery-dl/issues/2078), [#4104](https://github.com/mikf/gallery-dl/issues/4104)) +- [mangaread] add `mangaread.org` extractors ([#2425](https://github.com/mikf/gallery-dl/issues/2425), [#2781](https://github.com/mikf/gallery-dl/issues/2781)) +- [misskey] add `favorite` extractor ([#3950](https://github.com/mikf/gallery-dl/issues/3950)) +- [pixiv] add `novel` support ([#1241](https://github.com/mikf/gallery-dl/issues/1241), [#4044](https://github.com/mikf/gallery-dl/issues/4044)) +- [reddit] support cross-posted media ([#887](https://github.com/mikf/gallery-dl/issues/887), [#3586](https://github.com/mikf/gallery-dl/issues/3586), [#3976](https://github.com/mikf/gallery-dl/issues/3976)) +- [postprocessor:exec] support tilde expansion for `command` +- [formatter] support slicing strings as bytes ([#4087](https://github.com/mikf/gallery-dl/issues/4087)) +### Fixes +- [8muses] fix value of `album[url]` ([#3329](https://github.com/mikf/gallery-dl/issues/3329)) +- [danbooru] refactor pagination logic ([#4002](https://github.com/mikf/gallery-dl/issues/4002)) +- [fanbox] skip invalid posts 
([#4088](https://github.com/mikf/gallery-dl/issues/4088)) +- [gofile] automatically fetch `website-token` +- [kemonoparty] fix kemono and coomer logins sharing the same cache ([#4098](https://github.com/mikf/gallery-dl/issues/4098)) +- [newgrounds] add default delay between requests ([#4046](https://github.com/mikf/gallery-dl/issues/4046)) +- [nsfwalbum] detect placeholder images +- [poipiku] extract full `descriptions` ([#4066](https://github.com/mikf/gallery-dl/issues/4066)) +- [tcbscans] update domain to `tcbscans.com` ([#4080](https://github.com/mikf/gallery-dl/issues/4080)) +- [twitter] extract TwitPic URLs in text ([#3792](https://github.com/mikf/gallery-dl/issues/3792), [#3796](https://github.com/mikf/gallery-dl/issues/3796)) +- [weibo] require numeric IDs to have length >= 10 ([#4059](https://github.com/mikf/gallery-dl/issues/4059)) +- [ytdl] fix crash due to removed `no_color` attribute +- [cookies] improve logging behavior ([#4050](https://github.com/mikf/gallery-dl/issues/4050)) + +## 1.25.4 - 2023-05-07 +### Additions +- [4chanarchives] add `thread` and `board` extractors ([#4012](https://github.com/mikf/gallery-dl/issues/4012)) +- [foolfuuka] add `archive.palanq.win` +- [imgur] add `favorite-folder` extractor ([#4016](https://github.com/mikf/gallery-dl/issues/4016)) +- [mangadex] add `status` and `tags` metadata ([#4031](https://github.com/mikf/gallery-dl/issues/4031)) +- allow selecting a domain with `--cookies-from-browser` +- add `--cookies-export` command-line option +- add `-C` as short option for `--cookies` +- include exception type in config error messages +### Fixes +- [exhentai] update sadpanda check +- [imagechest] load all images when a "Load More" button is present ([#4028](https://github.com/mikf/gallery-dl/issues/4028)) +- [imgur] fix bug causing some images/albums from user profiles and favorites to be ignored +- [pinterest] update endpoint for related board pins +- [pinterest] fix `pin.it` extractor +- [ytdl] fix yt-dlp `--xff/--geo-bypass` tests ([#3989](https://github.com/mikf/gallery-dl/issues/3989)) +### Removals +- [420chan] remove module +- [foolfuuka] remove `archive.alice.al` and `tokyochronos.net` +- [foolslide] remove `sensescans.com` +- [nana] remove module + +## 1.25.3 - 2023-04-30 +### Additions +- [imagefap] extract `description` and `categories` metadata ([#3905](https://github.com/mikf/gallery-dl/issues/3905)) +- [imxto] add `gallery` extractor ([#1289](https://github.com/mikf/gallery-dl/issues/1289)) +- [itchio] add `game` extractor ([#3923](https://github.com/mikf/gallery-dl/issues/3923)) +- [nitter] extract user IDs from encoded banner URLs +- [pixiv] allow sorting search results by popularity ([#3970](https://github.com/mikf/gallery-dl/issues/3970)) +- [reddit] match `preview.redd.it` URLs ([#3935](https://github.com/mikf/gallery-dl/issues/3935)) +- [sankaku] support post URLs with MD5 hashes ([#3952](https://github.com/mikf/gallery-dl/issues/3952)) +- [shimmie2] add generic extractors for Shimmie2 sites ([#3734](https://github.com/mikf/gallery-dl/issues/3734), [#943](https://github.com/mikf/gallery-dl/issues/943)) +- [tumblr] add `day` extractor ([#3951](https://github.com/mikf/gallery-dl/issues/3951)) +- [twitter] support `profile-conversation` entries ([#3938](https://github.com/mikf/gallery-dl/issues/3938)) +- [vipergirls] add `thread` and `post` extractors ([#3812](https://github.com/mikf/gallery-dl/issues/3812), [#2720](https://github.com/mikf/gallery-dl/issues/2720), [#731](https://github.com/mikf/gallery-dl/issues/731)) +- 
[downloader:http] add `consume-content` option ([#3748](https://github.com/mikf/gallery-dl/issues/3748)) +### Fixes +- [2chen] update domain to sturdychan.help +- [behance] fix extraction ([#3980](https://github.com/mikf/gallery-dl/issues/3980)) +- [deviantart] retry downloads with private token ([#3941](https://github.com/mikf/gallery-dl/issues/3941)) +- [imagefap] fix empty `tags` metadata +- [manganelo] support arbitrary minor version separators ([#3972](https://github.com/mikf/gallery-dl/issues/3972)) +- [nozomi] fix file URLs ([#3925](https://github.com/mikf/gallery-dl/issues/3925)) +- [oauth] catch exceptions from `webbrowser.get()` ([#3947](https://github.com/mikf/gallery-dl/issues/3947)) +- [pixiv] fix `pixivision` extraction +- [reddit] ignore `id-max` value `"zik0zj"`/`2147483647` ([#3939](https://github.com/mikf/gallery-dl/issues/3939), [#3862](https://github.com/mikf/gallery-dl/issues/3862), [#3697](https://github.com/mikf/gallery-dl/issues/3697), [#3606](https://github.com/mikf/gallery-dl/issues/3606), [#3546](https://github.com/mikf/gallery-dl/issues/3546), [#3521](https://github.com/mikf/gallery-dl/issues/3521), [#3412](https://github.com/mikf/gallery-dl/issues/3412)) +- [sankaku] sanitize `date:` tags ([#1790](https://github.com/mikf/gallery-dl/issues/1790)) +- [tumblr] fix and update pagination logic ([#2191](https://github.com/mikf/gallery-dl/issues/2191)) +- [twitter] fix `user` metadata when downloading quoted Tweets ([#3922](https://github.com/mikf/gallery-dl/issues/3922)) +- [ytdl] fix crash due to `--geo-bypass` deprecation ([#3975](https://github.com/mikf/gallery-dl/issues/3975)) +- [postprocessor:metadata] support putting keys in quotes +- include more optional dependencies in executables ([#3907](https://github.com/mikf/gallery-dl/issues/3907)) + ## 1.25.2 - 2023-04-15 ### Additions - [deviantart] add `public` option diff --git a/README.rst b/README.rst index c3d09ff1..51e239c1 100644 --- a/README.rst +++ b/README.rst @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds @@ -123,6 +123,15 @@ For macOS or Linux users using Homebrew: brew install gallery-dl +MacPorts +-------- + +For macOS users with MacPorts: + +.. code:: bash + + sudo port install gallery-dl + Usage ===== diff --git a/docs/configuration.rst b/docs/configuration.rst index f5652b77..cc2bbc90 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -382,6 +382,7 @@ Description * ``e621`` (*) * ``e926`` (*) * ``exhentai`` + * ``gfycat`` * ``idolcomplex`` * ``imgbb`` * ``inkbunny`` @@ -395,6 +396,7 @@ Description * ``tapas`` * ``tsumino`` * ``twitter`` + * ``vipergirls`` * ``zerochan`` These values can also be specified via the @@ -440,30 +442,35 @@ Description "isAdult" : "1" } - * A ``list`` with up to 4 entries specifying a browser profile. + * A ``list`` with up to 5 entries specifying a browser profile. * The first entry is the browser name * The optional second entry is a profile name or an absolute path to a profile directory * The optional third entry is the keyring to retrieve passwords for decrypting cookies from * The optional fourth entry is a (Firefox) container name (``"none"`` for only cookies with no container) + * The optional fifth entry is the domain to extract cookies for. Prefix it with a dot ``.`` to include cookies for subdomains. 
Has no effect when also specifying a container. .. code:: json ["firefox"] ["firefox", null, null, "Personal"] - ["chromium", "Private", "kwallet"] + ["chromium", "Private", "kwallet", null, ".twitter.com"] extractor.*.cookies-update -------------------------- Type - ``bool`` + * ``bool`` + * |Path|_ Default ``true`` Description - If `extractor.*.cookies`_ specifies the |Path|_ of a cookies.txt - file and it can be opened and parsed without errors, - update its contents with cookies received during data extraction. + Export session cookies in cookies.txt format. + + * If this is a |Path|_, write cookies to the given file path. + + * If this is ``true`` and `extractor.*.cookies`_ specifies the |Path|_ + of a valid cookies.txt file, update its contents. extractor.*.proxy @@ -519,7 +526,7 @@ extractor.*.user-agent Type ``string`` Default - ``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0"`` + ``"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0"`` Description User-Agent header value to be used for HTTP requests. @@ -1151,7 +1158,7 @@ Description Note: This requires 1 additional HTTP request per 200-post batch. -extractor.{Danbooru].threshold +extractor.[Danbooru].threshold ------------------------------ Type * ``string`` @@ -1535,6 +1542,39 @@ Description from `linking your Flickr account to gallery-dl `__. +extractor.flickr.exif +--------------------- +Type + ``bool`` +Default + ``false`` +Description + Fetch `exif` and `camera` metadata for each photo. + + Note: This requires 1 additional API call per photo. + + +extractor.flickr.metadata +------------------------- +Type + * ``bool`` + * ``string`` + * ``list`` of ``strings`` +Default + ``false`` +Example + * ``license,last_update,machine_tags`` + * ``["license", "last_update", "machine_tags"]`` +Description + Extract additional metadata + (license, date_taken, original_format, last_update, geo, machine_tags, o_dims) + + It is possible to specify a custom list of metadata includes. + See `the extras parameter `__ + in `Flickr API docs `__ + for possible field names. + + extractor.flickr.videos ----------------------- Type @@ -1651,7 +1691,11 @@ Default ``["mp4", "webm", "mobile", "gif"]`` Description List of names of the preferred animation format, which can be - ``"mp4"``, ``"webm"``, ``"mobile"``, ``"gif"``, or ``"webp"``. + ``"mp4"``, + ``"webm"``, + ``"mobile"``, + ``"gif"``, or + ``"webp"``. If a selected format is not available, the next one in the list will be tried until an available format is found. @@ -1677,15 +1721,14 @@ extractor.gofile.website-token ------------------------------ Type ``string`` -Default - ``"12345"`` Description API token value used during API requests. - A not up-to-date value will result in ``401 Unauthorized`` errors. + An invalid or not up-to-date value + will result in ``401 Unauthorized`` errors. - Setting this value to ``null`` will do an extra HTTP request to fetch - the current value used by gofile. + Keeping this option unset will use an extra HTTP request + to attempt to fetch the current value used by gofile. extractor.gofile.recursive @@ -1733,6 +1776,21 @@ Description but is most likely going to fail with ``403 Forbidden`` errors. +extractor.imagechest.access-token +--------------------------------- +Type + ``string`` +Description + Your personal Image Chest access token. + + These tokens allow using the API instead of having to scrape HTML pages, + providing more detailed metadata. 
+ (``date``, ``description``, etc) + + See https://imgchest.com/docs/api/1.0/general/authorization + for instructions on how to generate such a token. + + extractor.imgur.client-id ------------------------- Type @@ -1808,6 +1866,55 @@ Description It is possible to use ``"all"`` instead of listing all values separately. +extractor.instagram.metadata +---------------------------- +Type + ``bool`` +Default + ``false`` +Description + Provide extended ``user`` metadata even when referring to a user by ID, + e.g. ``instagram.com/id:12345678``. + + Note: This metadata is always available when referring to a user by name, + e.g. ``instagram.com/USERNAME``. + + +extractor.instagram.order-files +------------------------------- +Type + ``string`` +Default + ``"asc"`` +Description + Controls the order in which files of each post are returned. + + * ``"asc"``: Same order as displayed in a post + * ``"desc"``: Reverse order as displayed in a post + * ``"reverse"``: Same as ``"desc"`` + + Note: This option does *not* affect ``{num}``. + To enumerate files in reverse order, use ``count - num + 1``. + + +extractor.instagram.order-posts +------------------------------- +Type + ``string`` +Default + ``"asc"`` +Description + Controls the order in which posts are returned. + + * ``"asc"``: Same order as displayed + * ``"desc"``: Reverse order as displayed + * ``"id"`` or ``"id_asc"``: Ascending order by ID + * ``"id_desc"``: Descending order by ID + * ``"reverse"``: Same as ``"desc"`` + + Note: This option only affects ``highlights``. + + extractor.instagram.previews ---------------------------- Type @@ -1979,18 +2086,21 @@ Example Description Additional query parameters to send when fetching manga chapters. - (See `/manga/{id}/feed `_ - and `/user/follows/manga/feed `_) + (See `/manga/{id}/feed `__ + and `/user/follows/manga/feed `__) extractor.mangadex.lang ----------------------- Type - ``string`` + * ``string`` + * ``list`` of ``strings`` Example - ``"en"`` + * ``"en"`` + * ``"fr,it"`` + * ``["fr", "it"]`` Description - `ISO 639-1 `__ language code + `ISO 639-1 `__ language codes to filter chapters by. @@ -2004,6 +2114,24 @@ Description List of acceptable content ratings for returned chapters. +extractor.mangapark.source +-------------------------- +Type + * ``string`` + * ``integer`` +Example + * ``"koala:en"`` + * ``15150116`` +Description + Select chapter source and language for a manga. + + | The general syntax is ``":"``. + | Both are optional, meaning ``"koala"``, ``"koala:"``, ``":en"``, + or even just ``":"`` are possible as well. + + Specifying the numeric ``ID`` of a source is also supported. + + extractor.[mastodon].access-token --------------------------------- Type @@ -2050,8 +2178,16 @@ Description Also emit metadata for text-only posts without media content. +extractor.[misskey].access-token +-------------------------------- +Type + ``string`` +Description + Your access token, necessary to fetch favorited notes. + + extractor.[misskey].renotes ----------------------------- +--------------------------- Type ``bool`` Default @@ -2061,7 +2197,7 @@ Description extractor.[misskey].replies ----------------------------- +--------------------------- Type ``bool`` Default @@ -2070,17 +2206,6 @@ Description Fetch media from replies to other notes. -extractor.nana.favkey ---------------------- -Type - ``string`` -Default - ``null`` -Description - Your `Nana Favorite Key `__, - used to access your favorite archives. 
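As an aside, several of the options documented above can be combined in a single configuration file. The following is an illustrative sketch only: the option names are taken from the sections above, while the chosen values and the placeholder token are assumptions for demonstration purposes.

.. code:: json

    {
        "extractor": {
            "instagram": {
                "order-files": "desc",
                "order-posts": "id_desc"
            },
            "mangapark": {
                "source": "koala:en"
            },
            "misskey": {
                "access-token": "your-misskey-token"
            }
        }
    }

With such a sketch, files within each Instagram post would be returned in reverse order, highlight posts ordered by descending ID, MangaPark chapters taken from the ``koala`` source in English, and the Misskey token would make favorited notes accessible.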
- - extractor.newgrounds.flash -------------------------- Type @@ -2341,7 +2466,12 @@ Description when processing a user profile. Possible values are - ``"artworks"``, ``"avatar"``, ``"background"``, ``"favorite"``. + ``"artworks"``, + ``"avatar"``, + ``"background"``, + ``"favorite"``, + ``"novel-user"``, + ``"novel-bookmark"``. It is possible to use ``"all"`` instead of listing all values separately. @@ -2357,6 +2487,27 @@ Description `gppt `__. +extractor.pixiv.embeds +---------------------- +Type + ``bool`` +Default + ``false`` +Description + Download images embedded in novels. + + +extractor.pixiv.novel.full-series +--------------------------------- +Type + ``bool`` +Default + ``false`` +Description + When downloading a novel being part of a series, + download all novels of that series. + + extractor.pixiv.metadata ------------------------ Type @@ -2602,7 +2753,12 @@ Default ``["hd", "sd", "gif"]`` Description List of names of the preferred animation format, which can be - ``"hd"``, ``"sd"``, `"gif"``, `"vthumbnail"``, `"thumbnail"``, or ``"poster"``. + ``"hd"``, + ``"sd"``, + ``"gif"``, + ``"thumbnail"``, + ``"vthumbnail"``, or + ``"poster"``. If a selected format is not available, the next one in the list will be tried until an available format is found. @@ -2901,15 +3057,19 @@ Description extractor.twitter.conversations ------------------------------- Type - ``bool`` + * ``bool`` + * ``string`` Default ``false`` Description For input URLs pointing to a single Tweet, e.g. `https://twitter.com/i/web/status/`, fetch media from all Tweets and replies in this `conversation - `__ - or thread. + `__. + + If this option is equal to ``"accessible"``, + only download from conversation Tweets + if the given initial Tweet is accessible. extractor.twitter.csrf @@ -2945,6 +3105,32 @@ Description `syndication `__ API. +extractor.twitter.include +------------------------- +Type + * ``string`` + * ``list`` of ``strings`` +Default + ``"timeline"`` +Example + * ``"avatar,background,media"`` + * ``["avatar", "background", "media"]`` +Description + A (comma-separated) list of subcategories to include + when processing a user profile. + + Possible values are + ``"avatar"``, + ``"background"``, + ``"timeline"``, + ``"tweets"``, + ``"media"``, + ``"replies"``, + ``"likes"``. + + It is possible to use ``"all"`` instead of listing all values separately. + + extractor.twitter.transform --------------------------- Type @@ -2955,6 +3141,20 @@ Description Transform Tweet and User metadata into a simpler, uniform format. +extractor.twitter.tweet-endpoint +-------------------------------- +Type + ``string`` +Default + ``"auto"`` +Description + Selects the API endpoint used to retrieve single Tweets. + + * ``"restid"``: ``/TweetResultByRestId`` - accessible to guest users + * ``"detail"``: ``/TweetDetail`` - more stable + * ``"auto"``: ``"detail"`` when logged in, ``"restid"`` otherwise + + extractor.twitter.size ---------------------- Type @@ -3027,6 +3227,19 @@ Description a quoted (original) Tweet when it sees the Tweet which quotes it. +extractor.twitter.ratelimit +--------------------------- +Type + ``string`` +Default + ``"wait"`` +Description + Selects how to handle exceeding the API rate limit. + + * ``"abort"``: Raise an error and stop extraction + * ``"wait"``: Wait until rate limit reset + + extractor.twitter.replies ------------------------- Type @@ -3067,8 +3280,8 @@ Type Default ``"auto"`` Description - Controls the strategy / tweet source used for user URLs - (``https://twitter.com/USER``). 
+ Controls the strategy / tweet source used for timeline URLs + (``https://twitter.com/USER/timeline``). * ``"tweets"``: `/tweets `__ timeline + search * ``"media"``: `/media `__ timeline + search @@ -3637,6 +3850,25 @@ Description contains JPEG/JFIF data. +downloader.http.consume-content +------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Controls the behavior when an HTTP response is considered + unsuccessful + + If the value is ``true``, consume the response body. This + avoids closing the connection and therefore improves connection + reuse. + + If the value is ``false``, immediately close the connection + without reading the response. This can be useful if the server + is known to send large bodies for error responses. + + downloader.http.chunk-size -------------------------- Type @@ -4497,7 +4729,7 @@ Default Description Name of the metadata field whose value should be used. - This value must either be a UNIX timestamp or a + This value must be either a UNIX timestamp or a |datetime|_ object. Note: This option gets ignored if `mtime.value`_ is set. @@ -4515,10 +4747,54 @@ Example Description A `format string`_ whose value should be used. - The resulting value must either be a UNIX timestamp or a + The resulting value must be either a UNIX timestamp or a |datetime|_ object. +python.archive +-------------- +Type + |Path|_ +Description + File to store IDs of called Python functions in, + similar to `extractor.*.archive`_. + + ``archive-format``, ``archive-prefix``, and ``archive-pragma`` options, + akin to + `extractor.*.archive-format`_, + `extractor.*.archive-prefix`_, and + `extractor.*.archive-pragma`_, are supported as well. + + +python.event +------------ +Type + ``string`` +Default + ``"file"`` +Description + The event for which `python.function`_ gets called. + + See `metadata.event`_ for a list of available events. + + +python.function +--------------- +Type + ``string`` +Example + * ``"my_module:generate_text"`` + * ``"~/.local/share/gdl-utils.py:resize"`` +Description + The Python function to call. + + This function gets specified as ``:`` + and gets called with the current metadata dict as argument. + + ``module`` is either an importable Python module name + or the |Path|_ to a `.py` file, + + ugoira.extension ---------------- Type @@ -4836,17 +5112,6 @@ Description used for (urllib3) warnings. -pyopenssl ---------- -Type - ``bool`` -Default - ``false`` -Description - Use `pyOpenSSL `__-backed - SSL-support. - - API Tokens & IDs ================ @@ -4912,6 +5177,10 @@ How To ``user-agent`` and replace ```` and ```` accordingly (see Reddit's `API access rules `__) + * clear your `cache `__ to delete any remaining + ``access-token`` entries. (``gallery-dl --clear-cache reddit``) + * get a `refresh-token `__ for the + new ``client-id`` (``gallery-dl oauth:reddit``) extractor.smugmug.api-key & .api-secret @@ -5123,6 +5392,8 @@ Description Write metadata to separate files ``mtime`` Set file modification time according to its metadata + ``python`` + Call Python functions ``ugoira`` Convert Pixiv Ugoira to WebM using `FFmpeg `__ ``zip`` diff --git a/docs/formatting.md b/docs/formatting.md index cc2703d2..f188a538 100644 --- a/docs/formatting.md +++ b/docs/formatting.md @@ -11,14 +11,16 @@ Field names select the metadata value to use in a replacement field. While simple names are usually enough, more complex forms like accessing values by attribute, element index, or slicing are also supported. 
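To see how such field names end up being used in practice, here is a small, hypothetical configuration snippet; the extractor name and the metadata fields are placeholders that mirror the examples in the table below, and `filename` is the standard per-extractor option that accepts a format string:

```json
{
    "extractor": {
        "example": {
            "filename": "{user[name]}_{title[:30]|id}.{extension}"
        }
    }
}
```

Here `{user[name]}` uses element access, `{title[:30]|id}` slices the title and falls back to `id` when `title` has no value, and `{extension}` is filled in per downloaded file.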
-| | Example | Result | -| -------------------- | ----------------- | ---------------------- | -| Name | `{title}` | `Hello World` | -| Element Index | `{title[6]}` | `W` | -| Slicing | `{title[3:8]}` | `lo Wo` | -| Alternatives | `{empty\|title}` | `Hello World` | -| Element Access | `{user[name]}` | `John Doe` | -| Attribute Access | `{extractor.url}` | `https://example.org/` | +| | Example | Result | +| -------------------- | ------------------- | ---------------------- | +| Name | `{title}` | `Hello World` | +| Element Index | `{title[6]}` | `W` | +| Slicing | `{title[3:8]}` | `lo Wo` | +| Slicing (Bytes) | `{title_ja[b3:18]}` | `ロー・ワー` | +| Alternatives | `{empty\|title}` | `Hello World` | +| Attribute Access | `{extractor.url}` | `https://example.org/` | +| Element Access | `{user[name]}` | `John Doe` | +| | `{user['name']}` | `John Doe` | All of these methods can be combined as needed. For example `{title[24]|empty|extractor.url[15:-1]}` would result in `.org`. @@ -92,6 +94,18 @@ Conversion specifiers allow to *convert* the value to a different form or type. {created!d} 2010-01-01 00:00:00 + + U + Convert HTML entities + {html!U} + <p>foo & bar</p> + + + H + Convert HTML entities & remove HTML tags + {html!H} + foo & bar + s Convert value to str @@ -150,6 +164,12 @@ Format specifiers can be used for advanced formatting by using the options provi {foo:[1:-1]} oo Ba + + [b<start>:<stop>] + Same as above, but applies to the bytes() representation of a string in filesystem encoding + {foo_ja:[b3:-1]} + ー・バ + L<maxlen>/<repl>/ Replaces the entire output with <repl> if its length exceeds <maxlen> @@ -193,7 +213,9 @@ Format specifiers can be used for advanced formatting by using the options provi -All special format specifiers (`?`, `L`, `J`, `R`, `D`, `O`) can be chained and combined with one another, but must always come before any standard format specifiers: +All special format specifiers (`?`, `L`, `J`, `R`, `D`, `O`, etc) +can be chained and combined with one another, +but must always appear before any standard format specifiers: For example `{foo:?//RF/B/Ro/e/> 10}` -> `   Bee Bar` - `?//` - Tests if `foo` has a value @@ -244,7 +266,7 @@ Replacement field names that are available in all format strings. ## Special Type Format Strings -Starting a format string with '\f ' allows to set a different format string type than the default. Available ones are: +Starting a format string with `\f ` allows to set a different format string type than the default. Available ones are: @@ -285,13 +307,3 @@ Starting a format string with '\f ' allows to set a different format strin
- -> **Note:** -> -> `\f` is the [Form Feed](https://en.wikipedia.org/w/index.php?title=Page_break&oldid=1027475805#Form_feed) -> character. (ASCII code 12 or 0xc) -> -> Writing it as `\f` is native to JSON, but will *not* get interpreted -> as such by most shells. To use this character there: -> * hold `Ctrl`, then press `v` followed by `l`, resulting in `^L` or -> * use `echo` or `printf` (e.g. `gallery-dl -f "$(echo -ne \\fM) my_module:generate_text"`) diff --git a/docs/gallery-dl.conf b/docs/gallery-dl.conf index 09d9e80a..b5efc734 100644 --- a/docs/gallery-dl.conf +++ b/docs/gallery-dl.conf @@ -10,7 +10,7 @@ "proxy": null, "skip": true, - "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:102.0) Gecko/20100101 Firefox/102.0", + "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:115.0) Gecko/20100101 Firefox/115.0", "retries": 4, "timeout": 30.0, "verify": true, @@ -108,8 +108,10 @@ }, "flickr": { - "videos": true, - "size-max": null + "exif": false, + "metadata": false, + "size-max": null, + "videos": true }, "furaffinity": { @@ -129,7 +131,7 @@ }, "gofile": { "api-token": null, - "website-token": "12345" + "website-token": null }, "hentaifoundry": { @@ -146,6 +148,9 @@ "password": null, "sleep-request": 5.0 }, + "imagechest": { + "access-token": null + }, "imgbb": { "username": null, @@ -166,6 +171,9 @@ "api": "rest", "cookies": null, "include": "posts", + "order-files": "asc", + "order-posts": "asc", + "previews": false, "sleep-request": [6.0, 12.0], "videos": true }, @@ -190,6 +198,7 @@ "password": null }, "misskey": { + "access-token": null, "renotes": false, "replies": true }, @@ -201,10 +210,6 @@ "format": "original", "include": "art" }, - "nana": - { - "favkey": null - }, "nijie": { "username": null, @@ -243,6 +248,7 @@ { "refresh-token": null, "include": "artworks", + "embeds": false, "metadata": false, "metadata-bookmark": false, "tags": "japanese", @@ -255,6 +261,9 @@ }, "reddit": { + "client-id": null, + "user-agent": null, + "refresh-token": null, "comments": 0, "morecomments": false, "date-min": 0, diff --git a/docs/options.md b/docs/options.md index 2df9788f..b0abcf85 100644 --- a/docs/options.md +++ b/docs/options.md @@ -18,12 +18,6 @@ --user-agent UA User-Agent request header --clear-cache MODULE Delete cached login sessions, cookies, etc. 
for MODULE (ALL to delete everything) - --cookies FILE File to load additional cookies from - --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] - Name of the browser to load cookies from, with - optional keyring name prefixed with '+', profile - prefixed with ':', and container prefixed with - '::' ('none' for no container) ## Output Options: -q, --quiet Activate quiet mode @@ -84,6 +78,16 @@ -p, --password PASS Password belonging to the given username --netrc Enable .netrc authentication data +## Cookie Options: + -C, --cookies FILE File to load additional cookies from + --cookies-export FILE Export session cookies to FILE + --cookies-from-browser BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER] + Name of the browser to load cookies from, with + optional domain prefixed with '/', keyring name + prefixed with '+', profile prefixed with ':', + and container prefixed with '::' ('none' for no + container) + ## Selection Options: --download-archive FILE Record all downloaded or skipped files in FILE and skip downloading any file already in it diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 5153a9dd..4608b6e4 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -32,14 +32,14 @@ Consider all sites to be NSFW unless otherwise known. - 420chan - https://420chan.org/ + 4chan + https://www.4chan.org/ Boards, Threads - 4chan - https://www.4chan.org/ + 4chanarchives + https://4chanarchives.com/ Boards, Threads @@ -111,7 +111,7 @@ Consider all sites to be NSFW unless otherwise known. Bunkr - https://bunkr.la/ + https://bunkrr.su/ Albums @@ -251,7 +251,7 @@ Consider all sites to be NSFW unless otherwise known. Gfycat https://gfycat.com/ Collections, individual Images, Search Results, User Profiles - + Supported Gofile @@ -394,7 +394,7 @@ Consider all sites to be NSFW unless otherwise known. imgur https://imgur.com/ - Albums, Favorites, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles + Albums, Favorites, Favorites Folders, Galleries, individual Images, Search Results, Subreddits, Tag Searches, User Profiles @@ -427,6 +427,18 @@ Consider all sites to be NSFW unless otherwise known. Galleries, individual Images + + itch.io + https://itch.io/ + Games + + + + JPG Fish + https://jpeg.pet/ + Albums, individual Images, User Profiles + + Keenspot http://www.keenspot.com/ @@ -451,6 +463,12 @@ Consider all sites to be NSFW unless otherwise known. Chapters, Manga + + Lensdump + https://lensdump.com/ + Albums, individual Images + + Lexica https://lexica.art/ @@ -463,12 +481,6 @@ Consider all sites to be NSFW unless otherwise known. Galleries - - LINE BLOG - https://www.lineblog.me/ - Blogs, Posts - - livedoor Blog http://blog.livedoor.jp/ @@ -523,6 +535,12 @@ Consider all sites to be NSFW unless otherwise known. Chapters, Manga + + MangaRead + https://mangaread.org/ + Chapters, Manga + + MangaSee https://mangasee123.com/ @@ -535,24 +553,12 @@ Consider all sites to be NSFW unless otherwise known. Albums, Channels Supported - - meme.museum - https://meme.museum/ - Posts, Tag Searches - - My Hentai Gallery https://myhentaigallery.com/ Galleries - - Nana - https://nana.my.id/ - Galleries, Favorites, Search Results - - Naver https://blog.naver.com/ @@ -652,7 +658,7 @@ Consider all sites to be NSFW unless otherwise known. 
Pixiv https://www.pixiv.net/ - Artworks, Avatars, Backgrounds, Favorites, Follows, pixiv.me Links, pixivision, Rankings, Search Results, Series, Sketch, User Profiles, individual Images + Artworks, Avatars, Backgrounds, Favorites, Follows, pixiv.me Links, Novels, Novel Bookmarks, Novel Series, pixivision, Rankings, Search Results, Series, Sketch, User Profiles, individual Images OAuth @@ -700,7 +706,7 @@ Consider all sites to be NSFW unless otherwise known. Postimg https://postimages.org/ - individual Images + Galleries, individual Images @@ -724,7 +730,7 @@ Consider all sites to be NSFW unless otherwise known. RedGIFs https://redgifs.com/ - Collections, individual Images, Search Results, User Profiles + Collections, individual Images, Niches, Search Results, User Profiles @@ -819,7 +825,7 @@ Consider all sites to be NSFW unless otherwise known. TCB Scans - https://onepiecechapters.com/ + https://tcbscans.com/ Chapters, Manga @@ -844,7 +850,7 @@ Consider all sites to be NSFW unless otherwise known. Tumblr https://www.tumblr.com/ - Likes, Posts, Tag Searches, User Profiles + Days, Likes, Posts, Tag Searches, User Profiles OAuth @@ -868,7 +874,7 @@ Consider all sites to be NSFW unless otherwise known. Twitter https://twitter.com/ - Avatars, Backgrounds, Bookmarks, Events, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets + Avatars, Backgrounds, Bookmarks, Events, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets, User Profiles Supported @@ -887,7 +893,7 @@ Consider all sites to be NSFW unless otherwise known. Vipergirls https://vipergirls.to/ Posts, Threads - + Supported Vipr @@ -989,7 +995,7 @@ Consider all sites to be NSFW unless otherwise known. Zerochan https://www.zerochan.net/ individual Images, Tag Searches - + Supported かべうち @@ -1003,12 +1009,6 @@ Consider all sites to be NSFW unless otherwise known. Posts, Tag Searches - - 半次元 - https://bcy.net/ - Posts, User Profiles - - Danbooru Instances @@ -1031,6 +1031,12 @@ Consider all sites to be NSFW unless otherwise known. Pools, Popular Images, Posts, Tag Searches Supported + + Booruvar + https://booru.borvar.art/ + Pools, Popular Images, Posts, Tag Searches + + e621 Instances @@ -1047,6 +1053,12 @@ Consider all sites to be NSFW unless otherwise known. Favorites, Pools, Popular Images, Posts, Tag Searches Supported + + e6AI + https://e6ai.net/ + Favorites, Pools, Popular Images, Posts, Tag Searches + + Gelbooru Beta 0.1.11 @@ -1076,8 +1088,8 @@ Consider all sites to be NSFW unless otherwise known. - /v/idyart - https://vidyart.booru.org/ + /v/idyart2 + https://vidyart2.booru.org/ Favorites, Posts, Tag Searches @@ -1116,6 +1128,16 @@ Consider all sites to be NSFW unless otherwise known. + + jschan Imageboards + + + 94chan + https://94chan.org/ + Boards, Threads + + + LynxChan Imageboards @@ -1144,19 +1166,19 @@ Consider all sites to be NSFW unless otherwise known. Misskey.io https://misskey.io/ - Images from Notes, User Profiles + Favorites, Images from Notes, User Profiles Lesbian.energy https://lesbian.energy/ - Images from Notes, User Profiles + Favorites, Images from Notes, User Profiles Sushi.ski https://sushi.ski/ - Images from Notes, User Profiles + Favorites, Images from Notes, User Profiles @@ -1266,6 +1288,40 @@ Consider all sites to be NSFW unless otherwise known. 
+ + Shimmie2 Instances + + + meme.museum + https://meme.museum/ + Posts, Tag Searches + + + + Loudbooru + https://loudbooru.com/ + Posts, Tag Searches + + + + Giantessbooru + https://giantessbooru.com/ + Posts, Tag Searches + + + + Tentaclerape + https://tentaclerape.net/ + Posts, Tag Searches + + + + Cavemanon + https://booru.cavemanon.xyz/ + Posts, Tag Searches + + + szurubooru Instances @@ -1388,14 +1444,8 @@ Consider all sites to be NSFW unless otherwise known. - Rozen Arcana - https://archive.alice.al/ - Boards, Galleries, Search Results, Threads - - - - TokyoChronos - https://www.tokyochronos.net/ + Palanq + https://archive.palanq.win/ Boards, Galleries, Search Results, Threads @@ -1421,12 +1471,6 @@ Consider all sites to be NSFW unless otherwise known. Chapters, Manga - - Sense-Scans - https://sensescans.com/reader/ - Chapters, Manga - - Mastodon Instances diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index a430f131..1450e8f2 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -70,12 +70,14 @@ def main(): if args.cookies_from_browser: browser, _, profile = args.cookies_from_browser.partition(":") browser, _, keyring = browser.partition("+") + browser, _, domain = browser.partition("/") if profile.startswith(":"): container = profile[1:] profile = None else: profile, _, container = profile.partition("::") - config.set((), "cookies", (browser, profile, keyring, container)) + config.set((), "cookies", ( + browser, profile, keyring, container, domain)) if args.options_pp: config.set((), "postprocessor-options", args.options_pp) for opts in args.options: diff --git a/gallery_dl/config.py b/gallery_dl/config.py index d014293e..0b2aca80 100644 --- a/gallery_dl/config.py +++ b/gallery_dl/config.py @@ -102,7 +102,8 @@ def load(files=None, strict=False, load=util.json_loads): log.error(exc) sys.exit(1) except Exception as exc: - log.warning("Could not parse '%s': %s", path, exc) + log.error("%s when loading '%s': %s", + exc.__class__.__name__, path, exc) if strict: sys.exit(2) else: @@ -118,7 +119,7 @@ def clear(): _config.clear() -def get(path, key, default=None, *, conf=_config): +def get(path, key, default=None, conf=_config): """Get the value of property 'key' or a default value""" try: for p in path: @@ -128,7 +129,7 @@ def get(path, key, default=None, *, conf=_config): return default -def interpolate(path, key, default=None, *, conf=_config): +def interpolate(path, key, default=None, conf=_config): """Interpolate the value of 'key'""" if key in conf: return conf[key] @@ -142,7 +143,7 @@ def interpolate(path, key, default=None, *, conf=_config): return default -def interpolate_common(common, paths, key, default=None, *, conf=_config): +def interpolate_common(common, paths, key, default=None, conf=_config): """Interpolate the value of 'key' using multiple 'paths' along a 'common' ancestor """ @@ -174,7 +175,7 @@ def interpolate_common(common, paths, key, default=None, *, conf=_config): return default -def accumulate(path, key, *, conf=_config): +def accumulate(path, key, conf=_config): """Accumulate the values of 'key' along 'path'""" result = [] try: @@ -193,7 +194,7 @@ def accumulate(path, key, *, conf=_config): return result -def set(path, key, value, *, conf=_config): +def set(path, key, value, conf=_config): """Set the value of property 'key' for this session""" for p in path: try: @@ -203,7 +204,7 @@ def set(path, key, value, *, conf=_config): conf[key] = value -def setdefault(path, key, value, *, conf=_config): +def setdefault(path, key, value, 
conf=_config): """Set the value of property 'key' if it doesn't exist""" for p in path: try: @@ -213,7 +214,7 @@ def setdefault(path, key, value, *, conf=_config): return conf.setdefault(key, value) -def unset(path, key, *, conf=_config): +def unset(path, key, conf=_config): """Unset the value of property 'key'""" try: for p in path: diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 3d715a77..c5c5667b 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -20,7 +20,6 @@ import struct import subprocess import sys import tempfile -from datetime import datetime, timedelta, timezone from hashlib import pbkdf2_hmac from http.cookiejar import Cookie from . import aes, text, util @@ -34,19 +33,19 @@ logger = logging.getLogger("cookies") def load_cookies(cookiejar, browser_specification): - browser_name, profile, keyring, container = \ + browser_name, profile, keyring, container, domain = \ _parse_browser_specification(*browser_specification) if browser_name == "firefox": - load_cookies_firefox(cookiejar, profile, container) + load_cookies_firefox(cookiejar, profile, container, domain) elif browser_name == "safari": - load_cookies_safari(cookiejar, profile) + load_cookies_safari(cookiejar, profile, domain) elif browser_name in SUPPORTED_BROWSERS_CHROMIUM: - load_cookies_chrome(cookiejar, browser_name, profile, keyring) + load_cookies_chrome(cookiejar, browser_name, profile, keyring, domain) else: raise ValueError("unknown browser '{}'".format(browser_name)) -def load_cookies_firefox(cookiejar, profile=None, container=None): +def load_cookies_firefox(cookiejar, profile=None, container=None, domain=None): path, container_id = _firefox_cookies_database(profile, container) with DatabaseCopy(path) as db: @@ -60,6 +59,13 @@ def load_cookies_firefox(cookiejar, profile=None, container=None): sql += " WHERE originAttributes LIKE ? OR originAttributes LIKE ?" uid = "%userContextId={}".format(container_id) parameters = (uid, uid + "&%") + elif domain: + if domain[0] == ".": + sql += " WHERE host == ? OR host LIKE ?" + parameters = (domain[1:], "%" + domain) + else: + sql += " WHERE host == ? OR host == ?" + parameters = (domain, "." 
+ domain) set_cookie = cookiejar.set_cookie for name, value, domain, path, secure, expires in db.execute( @@ -69,9 +75,10 @@ def load_cookies_firefox(cookiejar, profile=None, container=None): domain, bool(domain), domain.startswith("."), path, bool(path), secure, expires, False, None, None, {}, )) + _log_info("Extracted %s cookies from Firefox", len(cookiejar)) -def load_cookies_safari(cookiejar, profile=None): +def load_cookies_safari(cookiejar, profile=None, domain=None): """Ref.: https://github.com/libyal/dtformats/blob /main/documentation/Safari%20Cookies.asciidoc - This data appears to be out of date @@ -87,27 +94,40 @@ def load_cookies_safari(cookiejar, profile=None): _safari_parse_cookies_page(p.read_bytes(page_size), cookiejar) -def load_cookies_chrome(cookiejar, browser_name, profile, keyring): +def load_cookies_chrome(cookiejar, browser_name, profile=None, + keyring=None, domain=None): config = _get_chromium_based_browser_settings(browser_name) path = _chrome_cookies_database(profile, config) - logger.debug("Extracting cookies from %s", path) + _log_debug("Extracting cookies from %s", path) with DatabaseCopy(path) as db: db.text_factory = bytes decryptor = get_cookie_decryptor( - config["directory"], config["keyring"], keyring=keyring) + config["directory"], config["keyring"], keyring) + + if domain: + if domain[0] == ".": + condition = " WHERE host_key == ? OR host_key LIKE ?" + parameters = (domain[1:], "%" + domain) + else: + condition = " WHERE host_key == ? OR host_key == ?" + parameters = (domain, "." + domain) + else: + condition = "" + parameters = () try: rows = db.execute( "SELECT host_key, name, value, encrypted_value, path, " - "expires_utc, is_secure FROM cookies") + "expires_utc, is_secure FROM cookies" + condition, parameters) except sqlite3.OperationalError: rows = db.execute( "SELECT host_key, name, value, encrypted_value, path, " - "expires_utc, secure FROM cookies") + "expires_utc, secure FROM cookies" + condition, parameters) set_cookie = cookiejar.set_cookie - failed_cookies = unencrypted_cookies = 0 + failed_cookies = 0 + unencrypted_cookies = 0 for domain, name, value, enc_value, path, expires, secure in rows: @@ -135,11 +155,11 @@ def load_cookies_chrome(cookiejar, browser_name, profile, keyring): else: failed_message = "" - logger.info("Extracted %s cookies from %s%s", - len(cookiejar), browser_name, failed_message) - counts = decryptor.cookie_counts.copy() + _log_info("Extracted %s cookies from %s%s", + len(cookiejar), browser_name.capitalize(), failed_message) + counts = decryptor.cookie_counts counts["unencrypted"] = unencrypted_cookies - logger.debug("cookie version breakdown: %s", counts) + _log_debug("Cookie version breakdown: %s", counts) # -------------------------------------------------------------------- @@ -157,11 +177,11 @@ def _firefox_cookies_database(profile=None, container=None): if path is None: raise FileNotFoundError("Unable to find Firefox cookies database in " "{}".format(search_root)) - logger.debug("Extracting cookies from %s", path) + _log_debug("Extracting cookies from %s", path) if container == "none": container_id = False - logger.debug("Only loading cookies not belonging to any container") + _log_debug("Only loading cookies not belonging to any container") elif container: containers_path = os.path.join( @@ -171,8 +191,8 @@ def _firefox_cookies_database(profile=None, container=None): with open(containers_path) as file: identities = util.json_loads(file.read())["identities"] except OSError: - logger.error("Unable to read Firefox 
container database at %s", - containers_path) + _log_error("Unable to read Firefox container database at '%s'", + containers_path) raise except KeyError: identities = () @@ -183,10 +203,10 @@ def _firefox_cookies_database(profile=None, container=None): container_id = context["userContextId"] break else: - raise ValueError("Unable to find Firefox container {}".format( + raise ValueError("Unable to find Firefox container '{}'".format( container)) - logger.debug("Only loading cookies from container '%s' (ID %s)", - container, container_id) + _log_debug("Only loading cookies from container '%s' (ID %s)", + container, container_id) else: container_id = None @@ -209,7 +229,7 @@ def _safari_cookies_database(): path = os.path.expanduser("~/Library/Cookies/Cookies.binarycookies") return open(path, "rb") except FileNotFoundError: - logger.debug("Trying secondary cookie location") + _log_debug("Trying secondary cookie location") path = os.path.expanduser("~/Library/Containers/com.apple.Safari/Data" "/Library/Cookies/Cookies.binarycookies") return open(path, "rb") @@ -224,13 +244,13 @@ def _safari_parse_cookies_header(data): return page_sizes, p.cursor -def _safari_parse_cookies_page(data, jar): +def _safari_parse_cookies_page(data, cookiejar, domain=None): p = DataParser(data) p.expect_bytes(b"\x00\x00\x01\x00", "page signature") number_of_cookies = p.read_uint() record_offsets = [p.read_uint() for _ in range(number_of_cookies)] if number_of_cookies == 0: - logger.debug("a cookies page of size %s has no cookies", len(data)) + _log_debug("Cookies page of size %s has no cookies", len(data)) return p.skip_to(record_offsets[0], "unknown page header field") @@ -238,12 +258,12 @@ def _safari_parse_cookies_page(data, jar): for i, record_offset in enumerate(record_offsets): p.skip_to(record_offset, "space between records") record_length = _safari_parse_cookies_record( - data[record_offset:], jar) + data[record_offset:], cookiejar, domain) p.read_bytes(record_length) p.skip_to_end("space in between pages") -def _safari_parse_cookies_record(data, cookiejar): +def _safari_parse_cookies_record(data, cookiejar, host=None): p = DataParser(data) record_size = p.read_uint() p.skip(4, "unknown record field 1") @@ -262,6 +282,14 @@ def _safari_parse_cookies_record(data, cookiejar): p.skip_to(domain_offset) domain = p.read_cstring() + if host: + if host[0] == ".": + if host[1:] != domain and not domain.endswith(host): + return record_size + else: + if host != domain and ("." 
+ host) != domain: + return record_size + p.skip_to(name_offset) name = p.read_cstring() @@ -271,8 +299,7 @@ def _safari_parse_cookies_record(data, cookiejar): p.skip_to(value_offset) value = p.read_cstring() except UnicodeDecodeError: - logger.warning("failed to parse Safari cookie " - "because UTF-8 decoding failed") + _log_warning("Failed to parse Safari cookie") return record_size p.skip_to(record_size, "space at the end of the record") @@ -300,7 +327,7 @@ def _chrome_cookies_database(profile, config): elif config["profiles"]: search_root = os.path.join(config["directory"], profile) else: - logger.warning("%s does not support profiles", config["browser"]) + _log_warning("%s does not support profiles", config["browser"]) search_root = config["directory"] path = _find_most_recently_used_file(search_root, "Cookies") @@ -412,18 +439,17 @@ class ChromeCookieDecryptor: raise NotImplementedError("Must be implemented by sub classes") -def get_cookie_decryptor(browser_root, browser_keyring_name, *, keyring=None): +def get_cookie_decryptor(browser_root, browser_keyring_name, keyring=None): if sys.platform in ("win32", "cygwin"): return WindowsChromeCookieDecryptor(browser_root) elif sys.platform == "darwin": return MacChromeCookieDecryptor(browser_keyring_name) else: - return LinuxChromeCookieDecryptor( - browser_keyring_name, keyring=keyring) + return LinuxChromeCookieDecryptor(browser_keyring_name, keyring) class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): - def __init__(self, browser_keyring_name, *, keyring=None): + def __init__(self, browser_keyring_name, keyring=None): self._v10_key = self.derive_key(b"peanuts") password = _get_linux_keyring_password(browser_keyring_name, keyring) self._v11_key = None if password is None else self.derive_key(password) @@ -452,7 +478,7 @@ class LinuxChromeCookieDecryptor(ChromeCookieDecryptor): elif version == b"v11": self._cookie_counts["v11"] += 1 if self._v11_key is None: - logger.warning("cannot decrypt v11 cookies: no key found") + _log_warning("Unable to decrypt v11 cookies: no key found") return None return _decrypt_aes_cbc(ciphertext, self._v11_key) @@ -486,7 +512,7 @@ class MacChromeCookieDecryptor(ChromeCookieDecryptor): if version == b"v10": self._cookie_counts["v10"] += 1 if self._v10_key is None: - logger.warning("cannot decrypt v10 cookies: no key found") + _log_warning("Unable to decrypt v10 cookies: no key found") return None return _decrypt_aes_cbc(ciphertext, self._v10_key) @@ -516,7 +542,7 @@ class WindowsChromeCookieDecryptor(ChromeCookieDecryptor): if version == b"v10": self._cookie_counts["v10"] += 1 if self._v10_key is None: - logger.warning("cannot decrypt v10 cookies: no key found") + _log_warning("Unable to decrypt v10 cookies: no key found") return None # https://chromium.googlesource.com/chromium/src/+/refs/heads @@ -554,7 +580,7 @@ def _choose_linux_keyring(): SelectBackend """ desktop_environment = _get_linux_desktop_environment(os.environ) - logger.debug("Detected desktop environment: %s", desktop_environment) + _log_debug("Detected desktop environment: %s", desktop_environment) if desktop_environment == DE_KDE: return KEYRING_KWALLET if desktop_environment == DE_OTHER: @@ -582,23 +608,23 @@ def _get_kwallet_network_wallet(): ) if proc.returncode != 0: - logger.warning("failed to read NetworkWallet") + _log_warning("Failed to read NetworkWallet") return default_wallet else: network_wallet = stdout.decode().strip() - logger.debug("NetworkWallet = '%s'", network_wallet) + _log_debug("NetworkWallet = '%s'", network_wallet) 
return network_wallet except Exception as exc: - logger.warning("exception while obtaining NetworkWallet (%s: %s)", - exc.__class__.__name__, exc) + _log_warning("Error while obtaining NetworkWallet (%s: %s)", + exc.__class__.__name__, exc) return default_wallet def _get_kwallet_password(browser_keyring_name): - logger.debug("using kwallet-query to obtain password from kwallet") + _log_debug("Using kwallet-query to obtain password from kwallet") if shutil.which("kwallet-query") is None: - logger.error( + _log_error( "kwallet-query command not found. KWallet and kwallet-query " "must be installed to read from KWallet. kwallet-query should be " "included in the kwallet package for your distribution") @@ -615,14 +641,14 @@ def _get_kwallet_password(browser_keyring_name): ) if proc.returncode != 0: - logger.error("kwallet-query failed with return code {}. " - "Please consult the kwallet-query man page " - "for details".format(proc.returncode)) + _log_error("kwallet-query failed with return code {}. " + "Please consult the kwallet-query man page " + "for details".format(proc.returncode)) return b"" if stdout.lower().startswith(b"failed to read"): - logger.debug("Failed to read password from kwallet. " - "Using empty string instead") + _log_debug("Failed to read password from kwallet. " + "Using empty string instead") # This sometimes occurs in KDE because chrome does not check # hasEntry and instead just tries to read the value (which # kwallet returns "") whereas kwallet-query checks hasEntry. @@ -633,13 +659,12 @@ def _get_kwallet_password(browser_keyring_name): # random password and store it, but that doesn't matter here. return b"" else: - logger.debug("password found") if stdout[-1:] == b"\n": stdout = stdout[:-1] return stdout except Exception as exc: - logger.warning("exception running kwallet-query (%s: %s)", - exc.__class__.__name__, exc) + _log_warning("Error when running kwallet-query (%s: %s)", + exc.__class__.__name__, exc) return b"" @@ -647,7 +672,7 @@ def _get_gnome_keyring_password(browser_keyring_name): try: import secretstorage except ImportError: - logger.error("secretstorage not available") + _log_error("'secretstorage' Python package not available") return b"" # Gnome keyring does not seem to organise keys in the same way as KWallet, @@ -662,7 +687,7 @@ def _get_gnome_keyring_password(browser_keyring_name): if item.get_label() == label: return item.get_secret() else: - logger.error("failed to read from keyring") + _log_error("Failed to read from GNOME keyring") return b"" @@ -676,7 +701,7 @@ def _get_linux_keyring_password(browser_keyring_name, keyring): if not keyring: keyring = _choose_linux_keyring() - logger.debug("Chosen keyring: %s", keyring) + _log_debug("Chosen keyring: %s", keyring) if keyring == KEYRING_KWALLET: return _get_kwallet_password(browser_keyring_name) @@ -690,8 +715,8 @@ def _get_linux_keyring_password(browser_keyring_name, keyring): def _get_mac_keyring_password(browser_keyring_name): - logger.debug("using find-generic-password to obtain " - "password from OSX keychain") + _log_debug("Using find-generic-password to obtain " + "password from OSX keychain") try: proc, stdout = Popen_communicate( "security", "find-generic-password", @@ -704,28 +729,28 @@ def _get_mac_keyring_password(browser_keyring_name): stdout = stdout[:-1] return stdout except Exception as exc: - logger.warning("exception running find-generic-password (%s: %s)", - exc.__class__.__name__, exc) + _log_warning("Error when using find-generic-password (%s: %s)", + exc.__class__.__name__, 
exc) return None def _get_windows_v10_key(browser_root): path = _find_most_recently_used_file(browser_root, "Local State") if path is None: - logger.error("could not find local state file") + _log_error("Unable to find Local State file") return None - logger.debug("Found local state file at '%s'", path) + _log_debug("Found Local State file at '%s'", path) with open(path, encoding="utf-8") as file: data = util.json_loads(file.read()) try: base64_key = data["os_crypt"]["encrypted_key"] except KeyError: - logger.error("no encrypted key in Local State") + _log_error("Unable to find encrypted key in Local State") return None encrypted_key = binascii.a2b_base64(base64_key) prefix = b"DPAPI" if not encrypted_key.startswith(prefix): - logger.error("invalid key") + _log_error("Invalid Local State key") return None return _decrypt_windows_dpapi(encrypted_key[len(prefix):]) @@ -777,10 +802,10 @@ class DataParser: def skip(self, num_bytes, description="unknown"): if num_bytes > 0: - logger.debug("skipping {} bytes ({}): {!r}".format( + _log_debug("Skipping {} bytes ({}): {!r}".format( num_bytes, description, self.read_bytes(num_bytes))) elif num_bytes < 0: - raise ParserError("invalid skip of {} bytes".format(num_bytes)) + raise ParserError("Invalid skip of {} bytes".format(num_bytes)) def skip_to(self, offset, description="unknown"): self.skip(offset - self.cursor, description) @@ -893,8 +918,8 @@ def _get_linux_desktop_environment(env): def _mac_absolute_time_to_posix(timestamp): - return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + - timedelta(seconds=timestamp)).timestamp()) + # 978307200 is timestamp of 2001-01-01 00:00:00 + return 978307200 + int(timestamp) def pbkdf2_sha1(password, salt, iterations, key_length): @@ -902,31 +927,25 @@ def pbkdf2_sha1(password, salt, iterations, key_length): def _decrypt_aes_cbc(ciphertext, key, initialization_vector=b" " * 16): - plaintext = aes.unpad_pkcs7( - aes.aes_cbc_decrypt_bytes(ciphertext, key, initialization_vector)) try: - return plaintext.decode() + return aes.unpad_pkcs7(aes.aes_cbc_decrypt_bytes( + ciphertext, key, initialization_vector)).decode() except UnicodeDecodeError: - logger.warning("failed to decrypt cookie (AES-CBC) because UTF-8 " - "decoding failed. Possibly the key is wrong?") - return None + _log_warning("Failed to decrypt cookie (AES-CBC Unicode)") + except ValueError: + _log_warning("Failed to decrypt cookie (AES-CBC)") + return None def _decrypt_aes_gcm(ciphertext, key, nonce, authentication_tag): try: - plaintext = aes.aes_gcm_decrypt_and_verify_bytes( - ciphertext, key, authentication_tag, nonce) - except ValueError: - logger.warning("failed to decrypt cookie (AES-GCM) because MAC check " - "failed. Possibly the key is wrong?") - return None - - try: - return plaintext.decode() + return aes.aes_gcm_decrypt_and_verify_bytes( + ciphertext, key, authentication_tag, nonce).decode() except UnicodeDecodeError: - logger.warning("failed to decrypt cookie (AES-GCM) because UTF-8 " - "decoding failed. 
Possibly the key is wrong?") - return None + _log_warning("Failed to decrypt cookie (AES-GCM Unicode)") + except ValueError: + _log_warning("Failed to decrypt cookie (AES-GCM MAC)") + return None def _decrypt_windows_dpapi(ciphertext): @@ -954,7 +973,7 @@ def _decrypt_windows_dpapi(ciphertext): ctypes.byref(blob_out) # pDataOut ) if not ret: - logger.warning("failed to decrypt with DPAPI") + _log_warning("Failed to decrypt cookie (DPAPI)") return None result = ctypes.string_at(blob_out.pbData, blob_out.cbData) @@ -979,12 +998,29 @@ def _is_path(value): def _parse_browser_specification( - browser, profile=None, keyring=None, container=None): + browser, profile=None, keyring=None, container=None, domain=None): browser = browser.lower() if browser not in SUPPORTED_BROWSERS: - raise ValueError("unsupported browser '{}'".format(browser)) + raise ValueError("Unsupported browser '{}'".format(browser)) if keyring and keyring not in SUPPORTED_KEYRINGS: - raise ValueError("unsupported keyring '{}'".format(keyring)) + raise ValueError("Unsupported keyring '{}'".format(keyring)) if profile and _is_path(profile): profile = os.path.expanduser(profile) - return browser, profile, keyring, container + return browser, profile, keyring, container, domain + + +_log_cache = set() +_log_debug = logger.debug +_log_info = logger.info + + +def _log_warning(msg, *args): + if msg not in _log_cache: + _log_cache.add(msg) + logger.warning(msg, *args) + + +def _log_error(msg, *args): + if msg not in _log_cache: + _log_cache.add(msg) + logger.error(msg, *args) diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index 88e86e9c..4ec03983 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -44,6 +44,12 @@ class HttpDownloader(DownloaderBase): self.mtime = self.config("mtime", True) self.rate = self.config("rate") + if not self.config("consume-content", False): + # this resets the underlying TCP connection, and therefore + # if the program makes another request to the same domain, + # a new connection (either TLS or plain TCP) must be made + self.release_conn = lambda resp: resp.close() + if self.retries < 0: self.retries = float("inf") if self.minsize: @@ -106,7 +112,7 @@ class HttpDownloader(DownloaderBase): while True: if tries: if response: - response.close() + self.release_conn(response) response = None self.log.warning("%s (%s/%s)", msg, tries, self.retries+1) if tries > self.retries: @@ -165,18 +171,24 @@ class HttpDownloader(DownloaderBase): retry = kwdict.get("_http_retry") if retry and retry(response): continue + self.release_conn(response) self.log.warning(msg) return False # check for invalid responses validate = kwdict.get("_http_validate") if validate and self.validate: - result = validate(response) + try: + result = validate(response) + except Exception: + self.release_conn(response) + raise if isinstance(result, str): url = result tries -= 1 continue if not result: + self.release_conn(response) self.log.warning("Invalid response") return False @@ -184,11 +196,13 @@ class HttpDownloader(DownloaderBase): size = text.parse_int(size, None) if size is not None: if self.minsize and size < self.minsize: + self.release_conn(response) self.log.warning( "File size smaller than allowed minimum (%s < %s)", size, self.minsize) return False if self.maxsize and size > self.maxsize: + self.release_conn(response) self.log.warning( "File size larger than allowed maximum (%s > %s)", size, self.maxsize) @@ -280,6 +294,18 @@ class HttpDownloader(DownloaderBase): return True + 
def release_conn(self, response): + """Release connection back to pool by consuming response body""" + try: + for _ in response.iter_content(self.chunk_size): + pass + except (RequestException, SSLError, OpenSSLError) as exc: + print() + self.log.debug( + "Unable to consume response body (%s: %s); " + "closing the connection anyway", exc.__class__.__name__, exc) + response.close() + @staticmethod def receive(fp, content, bytes_total, bytes_start): write = fp.write diff --git a/gallery_dl/extractor/3dbooru.py b/gallery_dl/extractor/3dbooru.py index e0066cb9..e83bca75 100644 --- a/gallery_dl/extractor/3dbooru.py +++ b/gallery_dl/extractor/3dbooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2020 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -17,12 +17,10 @@ class _3dbooruBase(): basecategory = "booru" root = "http://behoimi.org" - def __init__(self, match): - super().__init__(match) - self.session.headers.update({ - "Referer": "http://behoimi.org/post/show/", - "Accept-Encoding": "identity", - }) + def _init(self): + headers = self.session.headers + headers["Referer"] = "http://behoimi.org/post/show/" + headers["Accept-Encoding"] = "identity" class _3dbooruTagExtractor(_3dbooruBase, moebooru.MoebooruTagExtractor): diff --git a/gallery_dl/extractor/420chan.py b/gallery_dl/extractor/420chan.py deleted file mode 100644 index fd0172e0..00000000 --- a/gallery_dl/extractor/420chan.py +++ /dev/null @@ -1,76 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2021 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. 
- -"""Extractors for https://420chan.org/""" - -from .common import Extractor, Message - - -class _420chanThreadExtractor(Extractor): - """Extractor for 420chan threads""" - category = "420chan" - subcategory = "thread" - directory_fmt = ("{category}", "{board}", "{thread} {title}") - archive_fmt = "{board}_{thread}_{filename}" - pattern = r"(?:https?://)?boards\.420chan\.org/([^/?#]+)/thread/(\d+)" - test = ("https://boards.420chan.org/ani/thread/33251/chow-chows", { - "pattern": r"https://boards\.420chan\.org/ani/src/\d+\.jpg", - "content": "b07c803b0da78de159709da923e54e883c100934", - "count": 2, - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board, self.thread = match.groups() - - def items(self): - url = "https://api.420chan.org/{}/res/{}.json".format( - self.board, self.thread) - posts = self.request(url).json()["posts"] - - data = { - "board" : self.board, - "thread": self.thread, - "title" : posts[0].get("sub") or posts[0]["com"][:50], - } - - yield Message.Directory, data - for post in posts: - if "filename" in post: - post.update(data) - post["extension"] = post["ext"][1:] - url = "https://boards.420chan.org/{}/src/{}{}".format( - post["board"], post["filename"], post["ext"]) - yield Message.Url, url, post - - -class _420chanBoardExtractor(Extractor): - """Extractor for 420chan boards""" - category = "420chan" - subcategory = "board" - pattern = r"(?:https?://)?boards\.420chan\.org/([^/?#]+)/\d*$" - test = ("https://boards.420chan.org/po/", { - "pattern": _420chanThreadExtractor.pattern, - "count": ">= 100", - }) - - def __init__(self, match): - Extractor.__init__(self, match) - self.board = match.group(1) - - def items(self): - url = "https://api.420chan.org/{}/threads.json".format(self.board) - threads = self.request(url).json() - - for page in threads: - for thread in page["threads"]: - url = "https://boards.420chan.org/{}/thread/{}/".format( - self.board, thread["no"]) - thread["page"] = page["page"] - thread["_extractor"] = _420chanThreadExtractor - yield Message.Queue, url, thread diff --git a/gallery_dl/extractor/4chanarchives.py b/gallery_dl/extractor/4chanarchives.py new file mode 100644 index 00000000..041e6a35 --- /dev/null +++ b/gallery_dl/extractor/4chanarchives.py @@ -0,0 +1,139 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://4chanarchives.com/""" + +from .common import Extractor, Message +from .. 
import text + + +class _4chanarchivesThreadExtractor(Extractor): + """Extractor for threads on 4chanarchives.com""" + category = "4chanarchives" + subcategory = "thread" + root = "https://4chanarchives.com" + directory_fmt = ("{category}", "{board}", "{thread} - {title}") + filename_fmt = "{no}-{filename}.{extension}" + archive_fmt = "{board}_{thread}_{no}" + pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)/thread/(\d+)" + test = ( + ("https://4chanarchives.com/board/c/thread/2707110", { + "pattern": r"https://i\.imgur\.com/(0wLGseE|qbByWDc)\.jpg", + "count": 2, + "keyword": { + "board": "c", + "com": str, + "name": "Anonymous", + "no": int, + "thread": "2707110", + "time": r"re:2016-07-1\d \d\d:\d\d:\d\d", + "title": "Ren Kagami from 'Oyako Neburi'", + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.thread = match.groups() + + def items(self): + url = "{}/board/{}/thread/{}".format( + self.root, self.board, self.thread) + page = self.request(url).text + data = self.metadata(page) + posts = self.posts(page) + + if not data["title"]: + data["title"] = text.unescape(text.remove_html( + posts[0]["com"]))[:50] + + for post in posts: + post.update(data) + yield Message.Directory, post + if "url" in post: + yield Message.Url, post["url"], post + + def metadata(self, page): + return { + "board" : self.board, + "thread" : self.thread, + "title" : text.unescape(text.extr( + page, 'property="og:title" content="', '"')), + } + + def posts(self, page): + """Build a list of all post objects""" + return [self.parse(html) for html in text.extract_iter( + page, 'id="pc', '')] + + def parse(self, html): + """Build post object by extracting data from an HTML post""" + post = self._extract_post(html) + if ">File: <" in html: + self._extract_file(html, post) + post["extension"] = post["url"].rpartition(".")[2] + return post + + @staticmethod + def _extract_post(html): + extr = text.extract_from(html) + return { + "no" : text.parse_int(extr('', '"')), + "name": extr('class="name">', '<'), + "time": extr('class="dateTime postNum" >', '<').rstrip(), + "com" : text.unescape( + html[html.find('")[2]), + } + + @staticmethod + def _extract_file(html, post): + extr = text.extract_from(html, html.index(">File: <")) + post["url"] = extr('href="', '"') + post["filename"] = text.unquote(extr(">", "<").rpartition(".")[0]) + post["fsize"] = extr("(", ", ") + post["w"] = text.parse_int(extr("", "x")) + post["h"] = text.parse_int(extr("", ")")) + + +class _4chanarchivesBoardExtractor(Extractor): + """Extractor for boards on 4chanarchives.com""" + category = "4chanarchives" + subcategory = "board" + root = "https://4chanarchives.com" + pattern = r"(?:https?://)?4chanarchives\.com/board/([^/?#]+)(?:/(\d+))?/?$" + test = ( + ("https://4chanarchives.com/board/c/", { + "pattern": _4chanarchivesThreadExtractor.pattern, + "range": "1-40", + "count": 40, + }), + ("https://4chanarchives.com/board/c"), + ("https://4chanarchives.com/board/c/10"), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + self.board, self.page = match.groups() + + def items(self): + data = {"_extractor": _4chanarchivesThreadExtractor} + pnum = text.parse_int(self.page, 1) + needle = ''' + = 5", }) - def metadata(self): + def _init(self): self.params = text.parse_query(self.user) self.user = None + + def metadata(self): return {"search_tags": self.params.get("tag")} def posts(self): diff --git a/gallery_dl/extractor/bcy.py b/gallery_dl/extractor/bcy.py deleted file mode 100644 index 
d6adb4eb..00000000 --- a/gallery_dl/extractor/bcy.py +++ /dev/null @@ -1,206 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2020-2023 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://bcy.net/""" - -from .common import Extractor, Message -from .. import text, util, exception -import re - - -class BcyExtractor(Extractor): - """Base class for bcy extractors""" - category = "bcy" - directory_fmt = ("{category}", "{user[id]} {user[name]}") - filename_fmt = "{post[id]} {id}.{extension}" - archive_fmt = "{post[id]}_{id}" - root = "https://bcy.net" - - def __init__(self, match): - Extractor.__init__(self, match) - self.item_id = match.group(1) - self.session.headers["Referer"] = self.root + "/" - - def items(self): - sub = re.compile(r"^https?://p\d+-bcy" - r"(?:-sign\.bcyimg\.com|\.byteimg\.com/img)" - r"/banciyuan").sub - iroot = "https://img-bcy-qn.pstatp.com" - noop = self.config("noop") - - for post in self.posts(): - if not post["image_list"]: - continue - - multi = None - tags = post.get("post_tags") or () - data = { - "user": { - "id" : post["uid"], - "name" : post["uname"], - "avatar" : sub(iroot, post["avatar"].partition("~")[0]), - }, - "post": { - "id" : text.parse_int(post["item_id"]), - "tags" : [t["tag_name"] for t in tags], - "date" : text.parse_timestamp(post["ctime"]), - "parody" : post["work"], - "content": post["plain"], - "likes" : post["like_count"], - "shares" : post["share_count"], - "replies": post["reply_count"], - }, - } - - yield Message.Directory, data - for data["num"], image in enumerate(post["image_list"], 1): - data["id"] = image["mid"] - data["width"] = image["w"] - data["height"] = image["h"] - - url = image["path"].partition("~")[0] - text.nameext_from_url(url, data) - - # full-resolution image without watermark - if data["extension"]: - if not url.startswith(iroot): - url = sub(iroot, url) - data["filter"] = "" - yield Message.Url, url, data - - # watermarked image & low quality noop filter - else: - if multi is None: - multi = self._data_from_post( - post["item_id"])["post_data"]["multi"] - image = multi[data["num"] - 1] - - if image["origin"]: - data["filter"] = "watermark" - yield Message.Url, image["origin"], data - - if noop: - data["extension"] = "" - data["filter"] = "noop" - yield Message.Url, image["original_path"], data - - def posts(self): - """Returns an iterable with all relevant 'post' objects""" - - def _data_from_post(self, post_id): - url = "{}/item/detail/{}".format(self.root, post_id) - page = self.request(url, notfound="post").text - data = (text.extr(page, 'JSON.parse("', '");') - .replace('\\\\u002F', '/') - .replace('\\"', '"')) - try: - return util.json_loads(data)["detail"] - except ValueError: - return util.json_loads(data.replace('\\"', '"'))["detail"] - - -class BcyUserExtractor(BcyExtractor): - """Extractor for user timelines""" - subcategory = "user" - pattern = r"(?:https?://)?bcy\.net/u/(\d+)" - test = ( - ("https://bcy.net/u/1933712", { - "pattern": r"https://img-bcy-qn.pstatp.com/\w+/\d+/post/\w+/.+jpg", - "count": ">= 20", - }), - ("https://bcy.net/u/109282764041", { - "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" - r"~tplv-bcyx-yuan-logo-v1:.+\.image", - "range": "1-25", - "count": 25, - }), - ) - - def posts(self): - url = self.root + "/apiv3/user/selfPosts" - params = {"uid": self.item_id, "since": None} - - while True: 
- data = self.request(url, params=params).json() - - try: - items = data["data"]["items"] - except KeyError: - return - if not items: - return - - for item in items: - yield item["item_detail"] - params["since"] = item["since"] - - -class BcyPostExtractor(BcyExtractor): - """Extractor for individual posts""" - subcategory = "post" - pattern = r"(?:https?://)?bcy\.net/item/detail/(\d+)" - test = ( - ("https://bcy.net/item/detail/6355835481002893070", { - "url": "301202375e61fd6e0e2e35de6c3ac9f74885dec3", - "count": 1, - "keyword": { - "user": { - "id" : 1933712, - "name" : "wukloo", - "avatar" : "re:https://img-bcy-qn.pstatp.com/Public/", - }, - "post": { - "id" : 6355835481002893070, - "tags" : list, - "date" : "dt:2016-11-22 08:47:46", - "parody" : "东方PROJECT", - "content": "re:根据微博的建议稍微做了点修改", - "likes" : int, - "shares" : int, - "replies": int, - }, - "id": 8330182, - "num": 1, - "width" : 3000, - "height": 1687, - "filename": "712e0780b09011e696f973c3d1568337", - "extension": "jpg", - }, - }), - # only watermarked images available - ("https://bcy.net/item/detail/6950136331708144648", { - "pattern": r"https://p\d-bcy-sign\.bcyimg\.com/banciyuan/[0-9a-f]+" - r"~tplv-bcyx-yuan-logo-v1:.+\.image", - "count": 10, - "keyword": {"filter": "watermark"}, - }), - # deleted - ("https://bcy.net/item/detail/6780546160802143237", { - "exception": exception.NotFoundError, - "count": 0, - }), - # only visible to logged in users - ("https://bcy.net/item/detail/6747523535150783495", { - "count": 0, - }), - # JSON decode error (#3321) - ("https://bcy.net/item/detail/7166939271872388110", { - "count": 0, - }), - ) - - def posts(self): - try: - data = self._data_from_post(self.item_id) - except KeyError: - return () - post = data["post_data"] - post["image_list"] = post["multi"] - post["plain"] = text.parse_unicode_escapes(post["plain"]) - post.update(data["detail_user"]) - return (post,) diff --git a/gallery_dl/extractor/behance.py b/gallery_dl/extractor/behance.py index 1469aad9..d8cc51d3 100644 --- a/gallery_dl/extractor/behance.py +++ b/gallery_dl/extractor/behance.py @@ -81,10 +81,13 @@ class BehanceGalleryExtractor(BehanceExtractor): ("https://www.behance.net/gallery/88276087/Audi-R8-RWD", { "count": 20, "url": "6bebff0d37f85349f9ad28bd8b76fd66627c1e2f", + "pattern": r"https://mir-s3-cdn-cf\.behance\.net/project_modules" + r"/source/[0-9a-f]+.[0-9a-f]+\.jpg" }), # 'video' modules (#1282) ("https://www.behance.net/gallery/101185577/COLCCI", { - "pattern": r"ytdl:https://cdn-prod-ccv\.adobe\.com/", + "pattern": r"https://cdn-prod-ccv\.adobe\.com/\w+" + r"/rend/\w+_720\.mp4\?", "count": 3, }), ) @@ -129,26 +132,35 @@ class BehanceGalleryExtractor(BehanceExtractor): append = result.append for module in data["modules"]: - mtype = module["type"] + mtype = module["__typename"] - if mtype == "image": - url = module["sizes"]["original"] + if mtype == "ImageModule": + url = module["imageSizes"]["size_original"]["url"] append((url, module)) - elif mtype == "video": - page = self.request(module["src"]).text - url = text.extr(page, '-QjgneIQv\.png", "content": "0c8768055e4e20e7c7259608b67799171b691140", "keyword": { @@ -52,6 +52,12 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "num": int, }, }), + # cdn12 .ru TLD (#4147) + ("https://bunkrr.su/a/j1G29CnD", { + "pattern": r"https://(cdn12.bunkr.ru|media-files12.bunkr.la)/\w+", + "count": 8, + }), + ("https://bunkrr.su/a/Lktg9Keq"), ("https://bunkr.la/a/Lktg9Keq"), ("https://bunkr.su/a/Lktg9Keq"), ("https://bunkr.ru/a/Lktg9Keq"), @@ -70,7 +76,7 @@ class 
BunkrAlbumExtractor(LolisafeAlbumExtractor): cdn = None files = [] append = files.append - headers = {"Referer": self.root.replace("://", "://stream.", 1) + "/"} + headers = {"Referer": self.root + "/"} pos = page.index('class="grid-images') for url in text.extract_iter(page, '= 3", + }), + ("https://booru.borvar.art/posts?tags=chibi&z=1", { + "pattern": r"https://booru\.borvar\.art/data/original" r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w+", "count": ">= 3", }), @@ -200,7 +215,21 @@ class DanbooruTagExtractor(DanbooruExtractor): return {"search_tags": self.tags} def posts(self): - return self._pagination("/posts.json", {"tags": self.tags}) + prefix = "b" + for tag in self.tags.split(): + if tag.startswith("order:"): + if tag == "order:id" or tag == "order:id_asc": + prefix = "a" + elif tag == "order:id_desc": + prefix = "b" + else: + prefix = None + elif tag.startswith( + ("id:", "md5", "ordfav:", "ordfavgroup:", "ordpool:")): + prefix = None + break + + return self._pagination("/posts.json", {"tags": self.tags}, prefix) class DanbooruPoolExtractor(DanbooruExtractor): @@ -217,6 +246,10 @@ class DanbooruPoolExtractor(DanbooruExtractor): "url": "902549ffcdb00fe033c3f63e12bc3cb95c5fd8d5", "count": 6, }), + ("https://booru.borvar.art/pools/2", { + "url": "77fa3559a3fc919f72611f4e3dd0f919d19d3e0d", + "count": 4, + }), ("https://aibooru.online/pools/1"), ("https://danbooru.donmai.us/pool/show/7659"), ) @@ -234,7 +267,7 @@ class DanbooruPoolExtractor(DanbooruExtractor): def posts(self): params = {"tags": "pool:" + self.pool_id} - return self._pagination("/posts.json", params) + return self._pagination("/posts.json", params, "b") class DanbooruPostExtractor(DanbooruExtractor): @@ -245,6 +278,7 @@ class DanbooruPostExtractor(DanbooruExtractor): test = ( ("https://danbooru.donmai.us/posts/294929", { "content": "5e255713cbf0a8e0801dc423563c34d896bb9229", + "keyword": {"date": "dt:2008-08-12 04:46:05"}, }), ("https://danbooru.donmai.us/posts/3613024", { "pattern": r"https?://.+\.zip$", @@ -256,6 +290,9 @@ class DanbooruPostExtractor(DanbooruExtractor): ("https://aibooru.online/posts/1", { "content": "54d548743cd67799a62c77cbae97cfa0fec1b7e9", }), + ("https://booru.borvar.art/posts/1487", { + "content": "91273ac1ea413a12be468841e2b5804656a50bff", + }), ("https://danbooru.donmai.us/post/show/294929"), ) @@ -287,6 +324,7 @@ class DanbooruPopularExtractor(DanbooruExtractor): }), ("https://booru.allthefallen.moe/explore/posts/popular"), ("https://aibooru.online/explore/posts/popular"), + ("https://booru.borvar.art/explore/posts/popular"), ) def __init__(self, match): @@ -307,7 +345,4 @@ class DanbooruPopularExtractor(DanbooruExtractor): return {"date": date, "scale": scale} def posts(self): - if self.page_start is None: - self.page_start = 1 - return self._pagination( - "/explore/posts/popular.json", self.params, True) + return self._pagination("/explore/posts/popular.json", self.params) diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index f532a976..3497b0c4 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -32,20 +32,24 @@ class DeviantartExtractor(Extractor): root = "https://www.deviantart.com" directory_fmt = ("{category}", "{username}") filename_fmt = "{category}_{index}_{title}.{extension}" - cookiedomain = None - cookienames = ("auth", "auth_secure", "userinfo") + cookies_domain = None + cookies_names = ("auth", "auth_secure", "userinfo") _last_request = 0 def __init__(self, match): Extractor.__init__(self, match) + 
self.user = match.group(1) or match.group(2) + + def _init(self): self.flat = self.config("flat", True) self.extra = self.config("extra", False) self.original = self.config("original", True) self.comments = self.config("comments", False) - self.user = match.group(1) or match.group(2) + + self.api = DeviantartOAuthAPI(self) self.group = False self.offset = 0 - self.api = None + self._premium_cache = {} unwatch = self.config("auto-unwatch") if unwatch: @@ -60,27 +64,28 @@ class DeviantartExtractor(Extractor): self._update_content = self._update_content_image self.original = True - self._premium_cache = {} - self.commit_journal = { - "html": self._commit_journal_html, - "text": self._commit_journal_text, - }.get(self.config("journals", "html")) + journals = self.config("journals", "html") + if journals == "html": + self.commit_journal = self._commit_journal_html + elif journals == "text": + self.commit_journal = self._commit_journal_text + else: + self.commit_journal = None def skip(self, num): self.offset += num return num def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if not username: - return False - self._update_cookies(_login_impl(self, username, password)) - return True + if self.cookies_check(self.cookies_names): + return True + + username, password = self._get_auth_info() + if username: + self.cookies_update(_login_impl(self, username, password)) + return True def items(self): - self.api = DeviantartOAuthAPI(self) - if self.user and self.config("group", True): profile = self.api.user_profile(self.user) self.group = not profile @@ -448,6 +453,9 @@ class DeviantartUserExtractor(DeviantartExtractor): ("https://shimoda7.deviantart.com/"), ) + def initialize(self): + pass + def items(self): base = "{}/{}/".format(self.root, self.user) return self._dispatch_extractors(( @@ -1105,11 +1113,14 @@ class DeviantartDeviationExtractor(DeviantartExtractor): match.group(4) or match.group(5) or id_from_base36(match.group(6)) def deviations(self): - url = "{}/{}/{}/{}".format( - self.root, self.user or "u", self.type or "art", self.deviation_id) + if self.user: + url = "{}/{}/{}/{}".format( + self.root, self.user, self.type or "art", self.deviation_id) + else: + url = "{}/view/{}/".format(self.root, self.deviation_id) - uuid = text.extract(self._limited_request(url).text, - '"deviationUuid\\":\\"', '\\')[0] + uuid = text.extr(self._limited_request(url).text, + '"deviationUuid\\":\\"', '\\') if not uuid: raise exception.NotFoundError("deviation") return (self.api.deviation(uuid),) @@ -1120,7 +1131,7 @@ class DeviantartScrapsExtractor(DeviantartExtractor): subcategory = "scraps" directory_fmt = ("{category}", "{username}", "Scraps") archive_fmt = "s_{_username}_{index}.{extension}" - cookiedomain = ".deviantart.com" + cookies_domain = ".deviantart.com" pattern = BASE_PATTERN + r"/gallery/(?:\?catpath=)?scraps\b" test = ( ("https://www.deviantart.com/shimoda7/gallery/scraps", { @@ -1143,7 +1154,7 @@ class DeviantartSearchExtractor(DeviantartExtractor): subcategory = "search" directory_fmt = ("{category}", "Search", "{search_tags}") archive_fmt = "Q_{search_tags}_{index}.{extension}" - cookiedomain = ".deviantart.com" + cookies_domain = ".deviantart.com" pattern = (r"(?:https?://)?www\.deviantart\.com" r"/search(?:/deviations)?/?\?([^#]+)") test = ( @@ -1202,7 +1213,7 @@ class DeviantartGallerySearchExtractor(DeviantartExtractor): """Extractor for deviantart gallery searches""" subcategory = "gallery-search" archive_fmt = 
"g_{_username}_{index}.{extension}" - cookiedomain = ".deviantart.com" + cookies_domain = ".deviantart.com" pattern = BASE_PATTERN + r"/gallery/?\?(q=[^#]+)" test = ( ("https://www.deviantart.com/shimoda7/gallery?q=memory", { @@ -1417,7 +1428,14 @@ class DeviantartOAuthAPI(): """Get the original file download (if allowed)""" endpoint = "/deviation/download/" + deviation_id params = {"mature_content": self.mature} - return self._call(endpoint, params=params, public=public) + + try: + return self._call( + endpoint, params=params, public=public, log=False) + except Exception: + if not self.refresh_token_key: + raise + return self._call(endpoint, params=params, public=False) def deviation_metadata(self, deviations): """ Fetch deviation metadata for a set of deviations""" @@ -1518,7 +1536,7 @@ class DeviantartOAuthAPI(): refresh_token_key, data["refresh_token"]) return "Bearer " + data["access_token"] - def _call(self, endpoint, fatal=True, public=None, **kwargs): + def _call(self, endpoint, fatal=True, log=True, public=None, **kwargs): """Call an API endpoint""" url = "https://www.deviantart.com/api/v1/oauth2" + endpoint kwargs["fatal"] = None @@ -1563,7 +1581,8 @@ class DeviantartOAuthAPI(): "cs/configuration.rst#extractordeviantartclient-id" "--client-secret") else: - self.log.error(msg) + if log: + self.log.error(msg) return data def _pagination(self, endpoint, params, @@ -1571,15 +1590,14 @@ class DeviantartOAuthAPI(): warn = True if public is None: public = self.public - elif not public: - self.public = False while True: data = self._call(endpoint, params=params, public=public) - if key not in data: + try: + results = data[key] + except KeyError: self.log.error("Unexpected API response: %s", data) return - results = data[key] if unpack: results = [item["journal"] for item in results @@ -1588,7 +1606,7 @@ class DeviantartOAuthAPI(): if public and len(results) < params["limit"]: if self.refresh_token_key: self.log.debug("Switching to private access token") - self.public = public = False + public = False continue elif data["has_more"] and warn: warn = False @@ -1859,7 +1877,7 @@ def _login_impl(extr, username, password): return { cookie.name: cookie.value - for cookie in extr.session.cookies + for cookie in extr.cookies } diff --git a/gallery_dl/extractor/e621.py b/gallery_dl/extractor/e621.py index 8f2994e4..cb1aea40 100644 --- a/gallery_dl/extractor/e621.py +++ b/gallery_dl/extractor/e621.py @@ -57,6 +57,8 @@ class E621Extractor(danbooru.DanbooruExtractor): post["filename"] = file["md5"] post["extension"] = file["ext"] + post["date"] = text.parse_datetime( + post["created_at"], "%Y-%m-%dT%H:%M:%S.%f%z") post.update(data) yield Message.Directory, post @@ -72,6 +74,10 @@ BASE_PATTERN = E621Extractor.update({ "root": "https://e926.net", "pattern": r"e926\.net", }, + "e6ai": { + "root": "https://e6ai.net", + "pattern": r"e6ai\.net", + }, }) @@ -92,6 +98,10 @@ class E621TagExtractor(E621Extractor, danbooru.DanbooruTagExtractor): }), ("https://e926.net/post/index/1/anry"), ("https://e926.net/post?tags=anry"), + + ("https://e6ai.net/posts?tags=anry"), + ("https://e6ai.net/post/index/1/anry"), + ("https://e6ai.net/post?tags=anry"), ) @@ -110,6 +120,11 @@ class E621PoolExtractor(E621Extractor, danbooru.DanbooruPoolExtractor): "content": "91abe5d5334425d9787811d7f06d34c77974cd22", }), ("https://e926.net/pool/show/73"), + + ("https://e6ai.net/pools/3", { + "url": "a6d1ad67a3fa9b9f73731d34d5f6f26f7e85855f", + }), + ("https://e6ai.net/pool/show/3"), ) def posts(self): @@ -140,6 +155,7 @@ class 
E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): ("https://e621.net/posts/535", { "url": "f7f78b44c9b88f8f09caac080adc8d6d9fdaa529", "content": "66f46e96a893fba8e694c4e049b23c2acc9af462", + "keyword": {"date": "dt:2007-02-17 19:02:32"}, }), ("https://e621.net/posts/3181052", { "options": (("metadata", "notes,pools"),), @@ -189,6 +205,12 @@ class E621PostExtractor(E621Extractor, danbooru.DanbooruPostExtractor): "content": "66f46e96a893fba8e694c4e049b23c2acc9af462", }), ("https://e926.net/post/show/535"), + + ("https://e6ai.net/posts/23", { + "url": "3c85a806b3d9eec861948af421fe0e8ad6b8f881", + "content": "a05a484e4eb64637d56d751c02e659b4bc8ea5d5", + }), + ("https://e6ai.net/post/show/23"), ) def posts(self): @@ -213,12 +235,12 @@ class E621PopularExtractor(E621Extractor, danbooru.DanbooruPopularExtractor): "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+", "count": ">= 70", }), + + ("https://e6ai.net/explore/posts/popular"), ) def posts(self): - if self.page_start is None: - self.page_start = 1 - return self._pagination("/popular.json", self.params, True) + return self._pagination("/popular.json", self.params) class E621FavoriteExtractor(E621Extractor): @@ -239,6 +261,8 @@ class E621FavoriteExtractor(E621Extractor): "pattern": r"https://static\d.e926.net/data/../../[0-9a-f]+", "count": "> 260", }), + + ("https://e6ai.net/favorites"), ) def __init__(self, match): @@ -249,6 +273,4 @@ class E621FavoriteExtractor(E621Extractor): return {"user_id": self.query.get("user_id", "")} def posts(self): - if self.page_start is None: - self.page_start = 1 - return self._pagination("/favorites.json", self.query, True) + return self._pagination("/favorites.json", self.query) diff --git a/gallery_dl/extractor/erome.py b/gallery_dl/extractor/erome.py index 03307f89..cb527410 100644 --- a/gallery_dl/extractor/erome.py +++ b/gallery_dl/extractor/erome.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -65,7 +65,7 @@ class EromeExtractor(Extractor): def request(self, url, **kwargs): if self.__cookies: self.__cookies = False - self.session.cookies.update(_cookie_cache()) + self.cookies.update(_cookie_cache()) for _ in range(5): response = Extractor.request(self, url, **kwargs) @@ -80,7 +80,7 @@ class EromeExtractor(Extractor): for params["page"] in itertools.count(1): page = self.request(url, params=params).text - album_ids = EromeAlbumExtractor.pattern.findall(page) + album_ids = EromeAlbumExtractor.pattern.findall(page)[::2] yield from album_ids if len(album_ids) < 36: diff --git a/gallery_dl/extractor/exhentai.py b/gallery_dl/extractor/exhentai.py index dccc74e4..d5f1d02b 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2014-2022 Mike Fährmann +# Copyright 2014-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -21,28 +21,31 @@ class ExhentaiExtractor(Extractor): """Base class for exhentai extractors""" category = "exhentai" directory_fmt = ("{category}", "{gid} {title[:247]}") - filename_fmt = ( - "{gid}_{num:>04}_{image_token}_{filename}.{extension}") + filename_fmt = "{gid}_{num:>04}_{image_token}_{filename}.{extension}" archive_fmt = "{gid}_{num}" - cookienames = 
("ipb_member_id", "ipb_pass_hash") - cookiedomain = ".exhentai.org" + cookies_domain = ".exhentai.org" + cookies_names = ("ipb_member_id", "ipb_pass_hash") root = "https://exhentai.org" request_interval = 5.0 LIMIT = False def __init__(self, match): - # allow calling 'self.config()' before 'Extractor.__init__()' - self._cfgpath = ("extractor", self.category, self.subcategory) + Extractor.__init__(self, match) + self.version = match.group(1) - version = match.group(1) + def initialize(self): domain = self.config("domain", "auto") if domain == "auto": - domain = ("ex" if version == "ex" else "e-") + "hentai.org" + domain = ("ex" if self.version == "ex" else "e-") + "hentai.org" self.root = "https://" + domain - self.cookiedomain = "." + domain + self.cookies_domain = "." + domain - Extractor.__init__(self, match) + Extractor.initialize(self) + + if self.version != "ex": + self.cookies.set("nw", "1", domain=self.cookies_domain) + self.session.headers["Referer"] = self.root + "/" self.original = self.config("original", True) limits = self.config("limits", False) @@ -52,14 +55,10 @@ class ExhentaiExtractor(Extractor): else: self.limits = False - self.session.headers["Referer"] = self.root + "/" - if version != "ex": - self.session.cookies.set("nw", "1", domain=self.cookiedomain) - - def request(self, *args, **kwargs): - response = Extractor.request(self, *args, **kwargs) - if self._is_sadpanda(response): - self.log.info("sadpanda.jpg") + def request(self, url, **kwargs): + response = Extractor.request(self, url, **kwargs) + if response.history and response.headers.get("Content-Length") == "0": + self.log.info("blank page") raise exception.AuthorizationError() return response @@ -67,17 +66,20 @@ class ExhentaiExtractor(Extractor): """Login and set necessary cookies""" if self.LIMIT: raise exception.StopExtraction("Image limit reached!") - if self._check_cookies(self.cookienames): + + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) - else: - self.log.info("no username given; using e-hentai.org") - self.root = "https://e-hentai.org" - self.original = False - self.limits = False - self.session.cookies["nw"] = "1" + return self.cookies_update(self._login_impl(username, password)) + + self.log.info("no username given; using e-hentai.org") + self.root = "https://e-hentai.org" + self.cookies_domain = ".e-hentai.org" + self.cookies.set("nw", "1", domain=self.cookies_domain) + self.original = False + self.limits = False @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -98,15 +100,7 @@ class ExhentaiExtractor(Extractor): response = self.request(url, method="POST", headers=headers, data=data) if b"You are now logged in as:" not in response.content: raise exception.AuthenticationError() - return {c: response.cookies[c] for c in self.cookienames} - - @staticmethod - def _is_sadpanda(response): - """Return True if the response object contains a sad panda""" - return ( - response.headers.get("Content-Length") == "9615" and - "sadpanda.jpg" in response.headers.get("Content-Disposition", "") - ) + return {c: response.cookies[c] for c in self.cookies_names} class ExhentaiGalleryExtractor(ExhentaiExtractor): @@ -180,6 +174,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): self.image_token = match.group(4) self.image_num = text.parse_int(match.group(6), 1) + def _init(self): source = self.config("source") if source == "hitomi": self.items = 
self._items_hitomi @@ -399,8 +394,9 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): url = "https://e-hentai.org/home.php" cookies = { cookie.name: cookie.value - for cookie in self.session.cookies - if cookie.domain == self.cookiedomain and cookie.name != "igneous" + for cookie in self.cookies + if cookie.domain == self.cookies_domain and + cookie.name != "igneous" } page = self.request(url, cookies=cookies).text diff --git a/gallery_dl/extractor/fanbox.py b/gallery_dl/extractor/fanbox.py index 57c43338..921ddb62 100644 --- a/gallery_dl/extractor/fanbox.py +++ b/gallery_dl/extractor/fanbox.py @@ -6,9 +6,9 @@ """Extractors for https://www.fanbox.cc/""" -import re from .common import Extractor, Message from .. import text +import re BASE_PATTERN = ( @@ -27,14 +27,12 @@ class FanboxExtractor(Extractor): archive_fmt = "{id}_{num}" _warning = True - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.embeds = self.config("embeds", True) def items(self): - if self._warning: - if not self._check_cookies(("FANBOXSESSID",)): + if not self.cookies_check(("FANBOXSESSID",)): self.log.warning("no 'FANBOXSESSID' cookie set") FanboxExtractor._warning = False @@ -52,8 +50,11 @@ class FanboxExtractor(Extractor): url = text.ensure_http_scheme(url) body = self.request(url, headers=headers).json()["body"] for item in body["items"]: - yield self._get_post_data(item["id"]) - + try: + yield self._get_post_data(item["id"]) + except Exception as exc: + self.log.warning("Skipping post %s (%s: %s)", + item["id"], exc.__class__.__name__, exc) url = body["nextUrl"] def _get_post_data(self, post_id): @@ -211,9 +212,15 @@ class FanboxExtractor(Extractor): # to a proper Fanbox URL url = "https://www.pixiv.net/fanbox/"+content_id # resolve redirect - response = self.request(url, method="HEAD", allow_redirects=False) - url = response.headers["Location"] - final_post["_extractor"] = FanboxPostExtractor + try: + url = self.request(url, method="HEAD", + allow_redirects=False).headers["location"] + except Exception as exc: + url = None + self.log.warning("Unable to extract fanbox embed %s (%s: %s)", + content_id, exc.__class__.__name__, exc) + else: + final_post["_extractor"] = FanboxPostExtractor elif provider == "twitter": url = "https://twitter.com/_/status/"+content_id elif provider == "google_forms": diff --git a/gallery_dl/extractor/fantia.py b/gallery_dl/extractor/fantia.py index 13dfeada..3679e375 100644 --- a/gallery_dl/extractor/fantia.py +++ b/gallery_dl/extractor/fantia.py @@ -23,30 +23,54 @@ class FantiaExtractor(Extractor): self.headers = { "Accept" : "application/json, text/plain, */*", "Referer": self.root, + "X-Requested-With": "XMLHttpRequest", + } + _empty_plan = { + "id" : 0, + "price": 0, + "limit": 0, + "name" : "", + "description": "", + "thumb": self.root + "/images/fallback/plan/thumb_default.png", } if self._warning: - if not self._check_cookies(("_session_id",)): + if not self.cookies_check(("_session_id",)): self.log.warning("no '_session_id' cookie set") FantiaExtractor._warning = False for post_id in self.posts(): - full_response, post = self._get_post_data(post_id) - yield Message.Directory, post + post = self._get_post_data(post_id) post["num"] = 0 - for url, url_data in self._get_urls_from_post(full_response, post): - post["num"] += 1 - fname = url_data["content_filename"] or url - text.nameext_from_url(fname, url_data) - url_data["file_url"] = url - yield Message.Url, url, url_data + + for content in self._get_post_contents(post): + 
post["content_category"] = content["category"] + post["content_title"] = content["title"] + post["content_filename"] = content.get("filename", "") + post["content_id"] = content["id"] + post["plan"] = content["plan"] or _empty_plan + yield Message.Directory, post + + if content["visible_status"] != "visible": + self.log.warning( + "Unable to download '%s' files from " + "%s#post-content-id-%s", content["visible_status"], + post["post_url"], content["id"]) + + for url in self._get_content_urls(post, content): + text.nameext_from_url( + post["content_filename"] or url, post) + post["file_url"] = url + post["num"] += 1 + yield Message.Url, url, post def posts(self): """Return post IDs""" def _pagination(self, url): params = {"page": 1} - headers = self.headers + headers = self.headers.copy() + del headers["X-Requested-With"] while True: page = self.request(url, params=params, headers=headers).text @@ -71,7 +95,7 @@ class FantiaExtractor(Extractor): """Fetch and process post data""" url = self.root+"/api/v1/posts/"+post_id resp = self.request(url, headers=self.headers).json()["post"] - post = { + return { "post_id": resp["id"], "post_url": self.root + "/posts/" + str(resp["id"]), "post_title": resp["title"], @@ -85,55 +109,65 @@ class FantiaExtractor(Extractor): "fanclub_user_name": resp["fanclub"]["user"]["name"], "fanclub_name": resp["fanclub"]["name"], "fanclub_url": self.root+"/fanclubs/"+str(resp["fanclub"]["id"]), - "tags": resp["tags"] + "tags": resp["tags"], + "_data": resp, } - return resp, post - def _get_urls_from_post(self, resp, post): + def _get_post_contents(self, post): + contents = post["_data"]["post_contents"] + + try: + url = post["_data"]["thumb"]["original"] + except Exception: + pass + else: + contents.insert(0, { + "id": "thumb", + "title": "thumb", + "category": "thumb", + "download_uri": url, + "visible_status": "visible", + "plan": None, + }) + + return contents + + def _get_content_urls(self, post, content): """Extract individual URL data from the response""" - if "thumb" in resp and resp["thumb"] and "original" in resp["thumb"]: - post["content_filename"] = "" - post["content_category"] = "thumb" - post["file_id"] = "thumb" - yield resp["thumb"]["original"], post + if "comment" in content: + post["content_comment"] = content["comment"] - for content in resp["post_contents"]: - post["content_category"] = content["category"] - post["content_title"] = content["title"] - post["content_filename"] = content.get("filename", "") - post["content_id"] = content["id"] + if "post_content_photos" in content: + for photo in content["post_content_photos"]: + post["file_id"] = photo["id"] + yield photo["url"]["original"] - if "comment" in content: - post["content_comment"] = content["comment"] + if "download_uri" in content: + post["file_id"] = content["id"] + url = content["download_uri"] + if url[0] == "/": + url = self.root + url + yield url - if "post_content_photos" in content: - for photo in content["post_content_photos"]: - post["file_id"] = photo["id"] - yield photo["url"]["original"], post + if content["category"] == "blog" and "comment" in content: + comment_json = util.json_loads(content["comment"]) + ops = comment_json.get("ops") or () - if "download_uri" in content: - post["file_id"] = content["id"] - yield self.root+"/"+content["download_uri"], post + # collect blogpost text first + blog_text = "" + for op in ops: + insert = op.get("insert") + if isinstance(insert, str): + blog_text += insert + post["blogpost_text"] = blog_text - if content["category"] == "blog" and 
"comment" in content: - comment_json = util.json_loads(content["comment"]) - ops = comment_json.get("ops", ()) - - # collect blogpost text first - blog_text = "" - for op in ops: - insert = op.get("insert") - if isinstance(insert, str): - blog_text += insert - post["blogpost_text"] = blog_text - - # collect images - for op in ops: - insert = op.get("insert") - if isinstance(insert, dict) and "fantiaImage" in insert: - img = insert["fantiaImage"] - post["file_id"] = img["id"] - yield "https://fantia.jp" + img["original_url"], post + # collect images + for op in ops: + insert = op.get("insert") + if isinstance(insert, dict) and "fantiaImage" in insert: + img = insert["fantiaImage"] + post["file_id"] = img["id"] + yield self.root + img["original_url"] class FantiaCreatorExtractor(FantiaExtractor): diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index e85d68ac..3b18c63e 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -20,12 +20,16 @@ class FlickrExtractor(Extractor): filename_fmt = "{category}_{id}.{extension}" directory_fmt = ("{category}", "{user[username]}") archive_fmt = "{id}" - cookiedomain = None + cookies_domain = None + request_interval = (1.0, 2.0) + request_interval_min = 0.2 def __init__(self, match): Extractor.__init__(self, match) - self.api = FlickrAPI(self) self.item_id = match.group(1) + + def _init(self): + self.api = FlickrAPI(self) self.user = None def items(self): @@ -106,6 +110,8 @@ class FlickrImageExtractor(FlickrExtractor): def items(self): photo = self.api.photos_getInfo(self.item_id) + if self.api.exif: + photo.update(self.api.photos_getExif(self.item_id)) if photo["media"] == "video" and self.api.videos: self.api._extract_video(photo) @@ -287,8 +293,8 @@ class FlickrAPI(oauth.OAuth1API): """ API_URL = "https://api.flickr.com/services/rest/" - API_KEY = "ac4fd7aa98585b9eee1ba761c209de68" - API_SECRET = "3adb0f568dc68393" + API_KEY = "f8f78d1a40debf471f0b22fa2d00525f" + API_SECRET = "4f9dae1113e45556" FORMATS = [ ("o" , "Original" , None), ("6k", "X-Large 6K" , 6144), @@ -323,6 +329,7 @@ class FlickrAPI(oauth.OAuth1API): def __init__(self, extractor): oauth.OAuth1API.__init__(self, extractor) + self.exif = extractor.config("exif", False) self.videos = extractor.config("videos", True) self.maxsize = extractor.config("size-max") if isinstance(self.maxsize, str): @@ -367,6 +374,11 @@ class FlickrAPI(oauth.OAuth1API): params = {"user_id": user_id} return self._pagination("people.getPhotos", params) + def photos_getExif(self, photo_id): + """Retrieves a list of EXIF/TIFF/GPS tags for a given photo.""" + params = {"photo_id": photo_id} + return self._call("photos.getExif", params)["photo"] + def photos_getInfo(self, photo_id): """Get information about a photo.""" params = {"photo_id": photo_id} @@ -451,9 +463,19 @@ class FlickrAPI(oauth.OAuth1API): return data def _pagination(self, method, params, key="photos"): - params["extras"] = ("description,date_upload,tags,views,media," - "path_alias,owner_name,") - params["extras"] += ",".join("url_" + fmt[0] for fmt in self.formats) + extras = ("description,date_upload,tags,views,media," + "path_alias,owner_name,") + includes = self.extractor.config("metadata") + if includes: + if isinstance(includes, (list, tuple)): + includes = ",".join(includes) + elif not isinstance(includes, str): + includes = ("license,date_taken,original_format,last_update," + "geo,machine_tags,o_dims") + extras = extras + includes + "," + extras += ",".join("url_" + fmt[0] for fmt in 
self.formats) + + params["extras"] = extras params["page"] = 1 while True: @@ -478,6 +500,9 @@ class FlickrAPI(oauth.OAuth1API): photo["views"] = text.parse_int(photo["views"]) photo["date"] = text.parse_timestamp(photo["dateupload"]) photo["tags"] = photo["tags"].split() + + if self.exif: + photo.update(self.photos_getExif(photo["id"])) photo["id"] = text.parse_int(photo["id"]) if "owner" in photo: diff --git a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 4f9a6bf5..fefb2c4c 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2022 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,10 +22,12 @@ class FoolfuukaExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) - self.session.headers["Referer"] = self.root if self.category == "b4k": self.remote = self._remote_direct + def _init(self): + self.session.headers["Referer"] = self.root + "/" + def items(self): yield Message.Directory, self.metadata() for post in self.posts(): @@ -88,13 +90,9 @@ BASE_PATTERN = FoolfuukaExtractor.update({ "root": "https://boards.fireden.net", "pattern": r"boards\.fireden\.net", }, - "rozenarcana": { - "root": "https://archive.alice.al", - "pattern": r"(?:archive\.)?alice\.al", - }, - "tokyochronos": { - "root": "https://www.tokyochronos.net", - "pattern": r"(?:www\.)?tokyochronos\.net", + "palanq": { + "root": "https://archive.palanq.win", + "pattern": r"archive\.palanq\.win", }, "rbt": { "root": "https://rbt.asia", @@ -137,11 +135,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): ("https://boards.fireden.net/sci/thread/11264294/", { "url": "61cab625c95584a12a30049d054931d64f8d20aa", }), - ("https://archive.alice.al/c/thread/2849220/", { - "url": "632e2c8de05de6b3847685f4bf1b4e5c6c9e0ed5", - }), - ("https://www.tokyochronos.net/a/thread/241664141/", { - "url": "ae03852cf44e3dcfce5be70274cb1828e1dbb7d6", + ("https://archive.palanq.win/c/thread/4209598/", { + "url": "1f9b5570d228f1f2991c827a6631030bc0e5933c", }), ("https://rbt.asia/g/thread/61487650/", { "url": "fadd274b25150a1bdf03a40c58db320fa3b617c4", @@ -187,8 +182,7 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/"), ("https://desuarchive.org/a/"), ("https://boards.fireden.net/sci/"), - ("https://archive.alice.al/c/"), - ("https://www.tokyochronos.net/a/"), + ("https://archive.palanq.win/c/"), ("https://rbt.asia/g/"), ("https://thebarchive.com/b/"), ) @@ -231,8 +225,7 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): ("https://archiveofsins.com/_/search/text/test/"), ("https://desuarchive.org/_/search/text/test/"), ("https://boards.fireden.net/_/search/text/test/"), - ("https://archive.alice.al/_/search/text/test/"), - ("https://www.tokyochronos.net/_/search/text/test/"), + ("https://archive.palanq.win/_/search/text/test/"), ("https://rbt.asia/_/search/text/test/"), ("https://thebarchive.com/_/search/text/test/"), ) @@ -297,8 +290,7 @@ class FoolfuukaGalleryExtractor(FoolfuukaExtractor): ("https://arch.b4k.co/meta/gallery/"), ("https://desuarchive.org/a/gallery/5"), ("https://boards.fireden.net/sci/gallery/6"), - ("https://archive.alice.al/c/gallery/7"), - ("https://www.tokyochronos.net/a/gallery/7"), + ("https://archive.palanq.win/c/gallery"), ("https://rbt.asia/g/gallery/8"), 
("https://thebarchive.com/b/gallery/9"), ) diff --git a/gallery_dl/extractor/foolslide.py b/gallery_dl/extractor/foolslide.py index 4a38fb4f..57d37b76 100644 --- a/gallery_dl/extractor/foolslide.py +++ b/gallery_dl/extractor/foolslide.py @@ -42,11 +42,6 @@ BASE_PATTERN = FoolslideExtractor.update({ "root": "https://read.powermanga.org", "pattern": r"read(?:er)?\.powermanga\.org", }, - "sensescans": { - "root": "https://sensescans.com/reader", - "pattern": r"(?:(?:www\.)?sensescans\.com/reader" - r"|reader\.sensescans\.com)", - }, }) @@ -64,11 +59,6 @@ class FoolslideChapterExtractor(FoolslideExtractor): "url": "854c5817f8f767e1bccd05fa9d58ffb5a4b09384", "keyword": "a60c42f2634b7387899299d411ff494ed0ad6dbe", }), - ("https://sensescans.com/reader/read/ao_no_orchestra/en/0/26/", { - "url": "bbd428dc578f5055e9f86ad635b510386cd317cd", - "keyword": "083ef6f8831c84127fe4096fa340a249be9d1424", - }), - ("https://reader.sensescans.com/read/ao_no_orchestra/en/0/26/"), ) def items(self): @@ -129,9 +119,6 @@ class FoolslideMangaExtractor(FoolslideExtractor): "volume": int, }, }), - ("https://sensescans.com/reader/series/yotsubato/", { - "count": ">= 3", - }), ) def items(self): diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index cc43cec9..8c3ef79d 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2022 Mike Fährmann +# Copyright 2020-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -20,13 +20,16 @@ class FuraffinityExtractor(Extractor): directory_fmt = ("{category}", "{user!l}") filename_fmt = "{id}{title:? //}.{extension}" archive_fmt = "{id}" - cookiedomain = ".furaffinity.net" + cookies_domain = ".furaffinity.net" + cookies_names = ("a", "b") root = "https://www.furaffinity.net" _warning = True def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + + def _init(self): self.offset = 0 if self.config("descriptions") == "html": @@ -39,9 +42,8 @@ class FuraffinityExtractor(Extractor): self._new_layout = None def items(self): - if self._warning: - if not self._check_cookies(("a", "b")): + if not self.cookies_check(self.cookies_names): self.log.warning("no 'a' and 'b' session cookies set") FuraffinityExtractor._warning = False @@ -98,7 +100,9 @@ class FuraffinityExtractor(Extractor): 'class="tags-row">', '')) data["title"] = text.unescape(extr("
<h2><p>", "</p></h2>"))
             data["artist"] = extr("<strong>", "<")
-            data["_description"] = extr('class="section-body">', '')
+            data["_description"] = extr(
+                'class="submission-description user-submitted-links">',
+                ' ')
             data["views"] = pi(rh(extr('class="views">', '</span>')))
             data["favorites"] = pi(rh(extr('class="favorites">', '</span>')))
             data["comments"] = pi(rh(extr('class="comments">', '</span>')))
@@ -125,7 +129,9 @@ class FuraffinityExtractor(Extractor):
             data["tags"] = text.split_html(extr(
                 'id="keywords">', ''))[::2]
             data["rating"] = extr('', ' ')
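(Aside: the parsing above walks the page with gallery_dl's cursor-style text.extract_from() helper. The stand-in below is only an illustrative sketch of that idea, not the real helper; it shows how each extr(begin, end) call advances a shared position so markers are consumed in page order.)

def extract_from(page):
    # simplified stand-in for gallery_dl.text.extract_from()
    pos = 0

    def extr(begin, end, default=""):
        nonlocal pos
        try:
            first = page.index(begin, pos) + len(begin)
            last = page.index(end, first)
        except ValueError:
            return default
        pos = last + len(end)
        return page[first:last]

    return extr

page = '<span class="views"><span>12</span></span><span class="favorites"><span>3</span></span>'
extr = extract_from(page)
print(extr('class="views">', '</span>'))      # -> '<span>12'
print(extr('class="favorites">', '</span>'))  # -> '<span>3'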
-            data[", "") + data["_description"] = extr( + '', ' ') data["artist_url"] = data["artist"].replace("_", "").lower() data["user"] = self.user or data["artist_url"] @@ -159,7 +165,13 @@ class FuraffinityExtractor(Extractor): while path: page = self.request(self.root + path).text - yield from text.extract_iter(page, 'id="sid-', '"') + extr = text.extract_from(page) + while True: + post_id = extr('id="sid-', '"') + if not post_id: + break + self._favorite_id = text.parse_int(extr('data-fav-id="', '"')) + yield post_id path = text.extr(page, 'right" href="', '"') def _pagination_search(self, query): @@ -241,6 +253,7 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): test = ("https://www.furaffinity.net/favorites/mirlinthloth/", { "pattern": r"https://d\d?\.f(uraffinity|acdn)\.net" r"/art/[^/]+/\d+/\d+.\w+\.\w+", + "keyword": {"favorite_id": int}, "range": "45-50", "count": 6, }) @@ -248,6 +261,12 @@ class FuraffinityFavoriteExtractor(FuraffinityExtractor): def posts(self): return self._pagination_favorites() + def _parse_post(self, post_id): + post = FuraffinityExtractor._parse_post(self, post_id) + if post: + post["favorite_id"] = self._favorite_id + return post + class FuraffinitySearchExtractor(FuraffinityExtractor): """Extractor for furaffinity search results""" @@ -354,7 +373,7 @@ class FuraffinityPostExtractor(FuraffinityExtractor): class FuraffinityUserExtractor(FuraffinityExtractor): """Extractor for furaffinity user profiles""" subcategory = "user" - cookiedomain = None + cookies_domain = None pattern = BASE_PATTERN + r"/user/([^/?#]+)" test = ( ("https://www.furaffinity.net/user/mirlinthloth/", { @@ -367,6 +386,9 @@ class FuraffinityUserExtractor(FuraffinityExtractor): }), ) + def initialize(self): + pass + def items(self): base = "{}/{{}}/{}/".format(self.root, self.user) return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/gelbooru_v01.py b/gallery_dl/extractor/gelbooru_v01.py index 9c19664e..b6fbcb6d 100644 --- a/gallery_dl/extractor/gelbooru_v01.py +++ b/gallery_dl/extractor/gelbooru_v01.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,29 +19,32 @@ class GelbooruV01Extractor(booru.BooruExtractor): def _parse_post(self, post_id): url = "{}/index.php?page=post&s=view&id={}".format( self.root, post_id) - page = self.request(url).text + extr = text.extract_from(self.request(url).text) - post = text.extract_all(page, ( - ("created_at", 'Posted: ', ' <'), - ("uploader" , 'By: ', ' <'), - ("width" , 'Size: ', 'x'), - ("height" , '', ' <'), - ("source" , 'Source: ', '<'), - ))[0] + post = { + "id" : post_id, + "created_at": extr('Posted: ', ' <'), + "uploader" : extr('By: ', ' <'), + "width" : extr('Size: ', 'x'), + "height" : extr('', ' <'), + "source" : extr('Source: ', ' <'), + "rating" : (extr('Rating: ', '<') or "?")[0].lower(), + "score" : extr('Score: ', ' <'), + "file_url" : extr('img', '<')), + } - post["id"] = post_id post["md5"] = post["file_url"].rpartition("/")[2].partition(".")[0] - post["rating"] = (post["rating"] or "?")[0].lower() - post["tags"] = text.unescape(post["tags"]) post["date"] = text.parse_datetime( post["created_at"], "%Y-%m-%d %H:%M:%S") return post + def skip(self, num): + self.page_start += num + return num + def _pagination(self, url, begin, end): pid = self.page_start @@ -75,9 +78,9 @@ BASE_PATTERN = 
GelbooruV01Extractor.update({ "root": "https://drawfriends.booru.org", "pattern": r"drawfriends\.booru\.org", }, - "vidyart": { - "root": "https://vidyart.booru.org", - "pattern": r"vidyart\.booru\.org", + "vidyart2": { + "root": "https://vidyart2.booru.org", + "pattern": r"vidyart2\.booru\.org", }, }) @@ -103,7 +106,7 @@ class GelbooruV01TagExtractor(GelbooruV01Extractor): "count": 25, }), ("https://drawfriends.booru.org/index.php?page=post&s=list&tags=all"), - ("https://vidyart.booru.org/index.php?page=post&s=list&tags=all"), + ("https://vidyart2.booru.org/index.php?page=post&s=list&tags=all"), ) def __init__(self, match): @@ -138,7 +141,7 @@ class GelbooruV01FavoriteExtractor(GelbooruV01Extractor): "count": 4, }), ("https://drawfriends.booru.org/index.php?page=favorites&s=view&id=1"), - ("https://vidyart.booru.org/index.php?page=favorites&s=view&id=1"), + ("https://vidyart2.booru.org/index.php?page=favorites&s=view&id=1"), ) def __init__(self, match): @@ -182,7 +185,7 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor): "md5": "2aaa0438d58fc7baa75a53b4a9621bb89a9d3fdb", "rating": "s", "score": str, - "source": None, + "source": "", "tags": "blush dress green_eyes green_hair hatsune_miku " "long_hair twintails vocaloid", "uploader": "Honochi31", @@ -190,7 +193,7 @@ class GelbooruV01PostExtractor(GelbooruV01Extractor): }, }), ("https://drawfriends.booru.org/index.php?page=post&s=view&id=107474"), - ("https://vidyart.booru.org/index.php?page=post&s=view&id=383111"), + ("https://vidyart2.booru.org/index.php?page=post&s=view&id=39168"), ) def __init__(self, match): diff --git a/gallery_dl/extractor/gelbooru_v02.py b/gallery_dl/extractor/gelbooru_v02.py index 958c4b58..1ef78efd 100644 --- a/gallery_dl/extractor/gelbooru_v02.py +++ b/gallery_dl/extractor/gelbooru_v02.py @@ -19,8 +19,7 @@ import re class GelbooruV02Extractor(booru.BooruExtractor): basecategory = "gelbooru_v02" - def __init__(self, match): - booru.BooruExtractor.__init__(self, match) + def _init(self): self.api_key = self.config("api-key") self.user_id = self.config("user-id") diff --git a/gallery_dl/extractor/gfycat.py b/gallery_dl/extractor/gfycat.py index 0ccd7fa5..53ef1180 100644 --- a/gallery_dl/extractor/gfycat.py +++ b/gallery_dl/extractor/gfycat.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,6 +10,7 @@ from .common import Extractor, Message from .. 
import text, exception +from ..cache import cache class GfycatExtractor(Extractor): @@ -23,6 +24,7 @@ class GfycatExtractor(Extractor): Extractor.__init__(self, match) self.key = match.group(1).lower() + def _init(self): formats = self.config("format") if formats is None: formats = ("mp4", "webm", "mobile", "gif") @@ -80,6 +82,8 @@ class GfycatUserExtractor(GfycatExtractor): }) def gfycats(self): + if self.key == "me": + return GfycatAPI(self).me() return GfycatAPI(self).user(self.key) @@ -219,15 +223,8 @@ class GfycatAPI(): def __init__(self, extractor): self.extractor = extractor - - def gfycat(self, gfycat_id): - endpoint = "/v1/gfycats/" + gfycat_id - return self._call(endpoint)["gfyItem"] - - def user(self, user): - endpoint = "/v1/users/{}/gfycats".format(user.lower()) - params = {"count": 100} - return self._pagination(endpoint, params) + self.headers = {} + self.username, self.password = extractor._get_auth_info() def collection(self, user, collection): endpoint = "/v1/users/{}/collections/{}/gfycats".format( @@ -240,14 +237,64 @@ class GfycatAPI(): params = {"count": 100} return self._pagination(endpoint, params, "gfyCollections") + def gfycat(self, gfycat_id): + endpoint = "/v1/gfycats/" + gfycat_id + return self._call(endpoint)["gfyItem"] + + def me(self): + endpoint = "/v1/me/gfycats" + params = {"count": 100} + return self._pagination(endpoint, params) + def search(self, query): endpoint = "/v1/gfycats/search" params = {"search_text": query, "count": 150} return self._pagination(endpoint, params) + def user(self, user): + endpoint = "/v1/users/{}/gfycats".format(user.lower()) + params = {"count": 100} + return self._pagination(endpoint, params) + + def authenticate(self): + self.headers["Authorization"] = \ + self._authenticate_impl(self.username, self.password) + + @cache(maxage=3600, keyarg=1) + def _authenticate_impl(self, username, password): + self.extractor.log.info("Logging in as %s", username) + + url = "https://weblogin.gfycat.com/oauth/webtoken" + headers = {"Origin": "https://gfycat.com"} + data = { + "access_key": "Anr96uuqt9EdamSCwK4txKPjMsf2" + "M95Rfa5FLLhPFucu8H5HTzeutyAa", + } + response = self.extractor.request( + url, method="POST", headers=headers, json=data).json() + + url = "https://weblogin.gfycat.com/oauth/weblogin" + headers["authorization"] = "Bearer " + response["access_token"] + data = { + "grant_type": "password", + "username" : username, + "password" : password, + } + response = self.extractor.request( + url, method="POST", headers=headers, json=data, fatal=None).json() + + if "errorMessage" in response: + raise exception.AuthenticationError( + response["errorMessage"]["description"]) + return "Bearer " + response["access_token"] + def _call(self, endpoint, params=None): + if self.username: + self.authenticate() + url = self.API_ROOT + endpoint - return self.extractor.request(url, params=params).json() + return self.extractor.request( + url, params=params, headers=self.headers).json() def _pagination(self, endpoint, params, key="gfycats"): while True: diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index b53ebbe3..60886a9d 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -6,7 +6,8 @@ from .common import Extractor, Message from .. 
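(Aside: the password login added to GfycatAPI above is a two-step token exchange against weblogin.gfycat.com. A rough sketch with plain requests, using only the endpoints and payload fields shown in the diff; the function name and error handling are illustrative assumptions, and the real code goes through the extractor's session and caches the result.)

import requests

def gfycat_bearer_token(username, password, access_key):
    headers = {"Origin": "https://gfycat.com"}

    # step 1: trade the public access key for an anonymous bearer token
    response = requests.post(
        "https://weblogin.gfycat.com/oauth/webtoken",
        headers=headers, json={"access_key": access_key}).json()

    # step 2: use that token to perform a password grant for the user
    headers["authorization"] = "Bearer " + response["access_token"]
    response = requests.post(
        "https://weblogin.gfycat.com/oauth/weblogin",
        headers=headers,
        json={"grant_type": "password",
              "username": username,
              "password": password}).json()

    if "errorMessage" in response:
        raise RuntimeError(response["errorMessage"]["description"])
    return "Bearer " + response["access_token"]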
import text, exception -from ..cache import memcache +from ..cache import cache, memcache +import hashlib class GofileFolderExtractor(Extractor): @@ -66,19 +67,18 @@ class GofileFolderExtractor(Extractor): def items(self): recursive = self.config("recursive") + password = self.config("password") token = self.config("api-token") if not token: token = self._create_account() - self.session.cookies.set("accountToken", token, domain=".gofile.io") + self.cookies.set("accountToken", token, domain=".gofile.io") self.api_token = token - token = self.config("website-token", "12345") - if not token: - token = self._get_website_token() - self.website_token = token + self.website_token = (self.config("website-token") or + self._get_website_token()) - folder = self._get_content(self.content_id) + folder = self._get_content(self.content_id, password) yield Message.Directory, folder num = 0 @@ -109,17 +109,20 @@ class GofileFolderExtractor(Extractor): self.log.debug("Creating temporary account") return self._api_request("createAccount")["token"] - @memcache() + @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") - page = self.request(self.root + "/contents/files.html").text - return text.extract(page, "websiteToken:", ",")[0].strip("\" ") + page = self.request(self.root + "/dist/js/alljs.js").text + return text.extr(page, 'fetchData.websiteToken = "', '"') - def _get_content(self, content_id): + def _get_content(self, content_id, password=None): + if password is not None: + password = hashlib.sha256(password.encode()).hexdigest() return self._api_request("getContent", { "contentId" : content_id, "token" : self.api_token, "websiteToken": self.website_token, + "password" : password, }) def _api_request(self, endpoint, params=None): diff --git a/gallery_dl/extractor/hentaicosplays.py b/gallery_dl/extractor/hentaicosplays.py index 593a8464..ac03923f 100644 --- a/gallery_dl/extractor/hentaicosplays.py +++ b/gallery_dl/extractor/hentaicosplays.py @@ -57,7 +57,9 @@ class HentaicosplaysGalleryExtractor(GalleryExtractor): self.root = text.ensure_http_scheme(root) url = "{}/story/{}/".format(self.root, self.slug) GalleryExtractor.__init__(self, match, url) - self.session.headers["Referer"] = url + + def _init(self): + self.session.headers["Referer"] = self.gallery_url def metadata(self, page): title = text.extr(page, "", "") diff --git a/gallery_dl/extractor/hentaifoundry.py b/gallery_dl/extractor/hentaifoundry.py index e01a4ed8..56ea1d4d 100644 --- a/gallery_dl/extractor/hentaifoundry.py +++ b/gallery_dl/extractor/hentaifoundry.py @@ -20,7 +20,7 @@ class HentaifoundryExtractor(Extractor): directory_fmt = ("{category}", "{user}") filename_fmt = "{category}_{index}_{title}.{extension}" archive_fmt = "{index}" - cookiedomain = "www.hentai-foundry.com" + cookies_domain = "www.hentai-foundry.com" root = "https://www.hentai-foundry.com" per_page = 25 @@ -123,14 +123,14 @@ class HentaifoundryExtractor(Extractor): def _init_site_filters(self): """Set site-internal filters to show all images""" - if self.session.cookies.get("PHPSESSID", domain=self.cookiedomain): + if self.cookies.get("PHPSESSID", domain=self.cookies_domain): return url = self.root + "/?enterAgree=1" self.request(url, method="HEAD") - csrf_token = self.session.cookies.get( - "YII_CSRF_TOKEN", domain=self.cookiedomain) + csrf_token = self.cookies.get( + "YII_CSRF_TOKEN", domain=self.cookies_domain) if not csrf_token: self.log.warning("Unable to update site content filters") return @@ -170,6 +170,9 @@ class 
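(Aside: a minimal sketch of how the new gofile 'password' option above ends up in the getContent call: the plaintext is SHA-256 hashed before being sent. Standalone illustration only, with a made-up content id; in the extractor these parameters are passed through _api_request().)

import hashlib

def gofile_content_params(content_id, api_token, website_token, password=None):
    if password is not None:
        password = hashlib.sha256(password.encode()).hexdigest()
    return {
        "contentId"   : content_id,
        "token"       : api_token,
        "websiteToken": website_token,
        "password"    : password,
    }

print(gofile_content_params("Cx8pxs", "api-token", "website-token", "hunter2"))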
HentaifoundryUserExtractor(HentaifoundryExtractor): pattern = BASE_PATTERN + r"/user/([^/?#]+)/profile" test = ("https://www.hentai-foundry.com/user/Tenpura/profile",) + def initialize(self): + pass + def items(self): root = self.root user = "/user/" + self.user diff --git a/gallery_dl/extractor/hentaifox.py b/gallery_dl/extractor/hentaifox.py index ed8576f1..a1e681d1 100644 --- a/gallery_dl/extractor/hentaifox.py +++ b/gallery_dl/extractor/hentaifox.py @@ -45,6 +45,15 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): "type": "doujinshi", }, }), + # email-protected title (#4201) + ("https://hentaifox.com/gallery/35261/", { + "keyword": { + "gallery_id": 35261, + "title": "ManageM@ster!", + "artist": ["haritama hiroki"], + "group": ["studio n.ball"], + }, + }), ) def __init__(self, match): @@ -65,13 +74,14 @@ class HentaifoxGalleryExtractor(HentaifoxBase, GalleryExtractor): return { "gallery_id": text.parse_int(self.gallery_id), - "title" : text.unescape(extr("
<h1>", "</h1>
")), "parody" : split(extr(">Parodies:" , "")), "characters": split(extr(">Characters:", "")), "tags" : split(extr(">Tags:" , "")), "artist" : split(extr(">Artists:" , "")), "group" : split(extr(">Groups:" , "")), "type" : text.remove_html(extr(">Category:", "" in page: + url = "{}/p/{}/loadAll".format(self.root, self.gallery_id) + headers = { + "X-Requested-With": "XMLHttpRequest", + "Origin" : self.root, + "Referer" : self.gallery_url, + } + csrf_token = text.extr(page, 'name="csrf-token" content="', '"') + data = {"_token": csrf_token} + page += self.request( + url, method="POST", headers=headers, data=data).text + return [ (url, None) for url in text.extract_iter(page, 'data-url="', '"') ] + + def _metadata_api(self, page): + post = self.api.post(self.gallery_id) + + post["date"] = text.parse_datetime( + post["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + for img in post["images"]: + img["date"] = text.parse_datetime( + img["created"], "%Y-%m-%dT%H:%M:%S.%fZ") + + post["gallery_id"] = self.gallery_id + post.pop("image_count", None) + self._image_list = post.pop("images") + + return post + + def _images_api(self, page): + return [ + (img["link"], img) + for img in self._image_list + ] + + +class ImagechestAPI(): + """Interface for the Image Chest API + + https://imgchest.com/docs/api/1.0/general/overview + """ + root = "https://api.imgchest.com" + + def __init__(self, extractor, access_token): + self.extractor = extractor + self.headers = {"Authorization": "Bearer " + access_token} + + def file(self, file_id): + endpoint = "/v1/file/" + file_id + return self._call(endpoint) + + def post(self, post_id): + endpoint = "/v1/post/" + post_id + return self._call(endpoint) + + def user(self, username): + endpoint = "/v1/user/" + username + return self._call(endpoint) + + def _call(self, endpoint): + url = self.root + endpoint + + while True: + response = self.extractor.request( + url, headers=self.headers, fatal=None, allow_redirects=False) + + if response.status_code < 300: + return response.json()["data"] + + elif response.status_code < 400: + raise exception.AuthenticationError("Invalid API access token") + + elif response.status_code == 429: + self.extractor.wait(seconds=600) + + else: + self.extractor.log.debug(response.text) + raise exception.StopExtraction("API request failed") diff --git a/gallery_dl/extractor/imagefap.py b/gallery_dl/extractor/imagefap.py index c91347e6..43ac3a35 100644 --- a/gallery_dl/extractor/imagefap.py +++ b/gallery_dl/extractor/imagefap.py @@ -23,9 +23,8 @@ class ImagefapExtractor(Extractor): archive_fmt = "{gallery_id}_{image_id}" request_interval = (2.0, 4.0) - def __init__(self, match): - Extractor.__init__(self, match) - self.session.headers["Referer"] = self.root + def _init(self): + self.session.headers["Referer"] = self.root + "/" def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) @@ -283,7 +282,7 @@ class ImagefapFolderExtractor(ImagefapExtractor): yield gid, extr("", "<") cnt += 1 - if cnt < 25: + if cnt < 20: break params["page"] += 1 diff --git a/gallery_dl/extractor/imagehosts.py b/gallery_dl/extractor/imagehosts.py index 66112a94..8ef51b0a 100644 --- a/gallery_dl/extractor/imagehosts.py +++ b/gallery_dl/extractor/imagehosts.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,23 +19,23 @@ class 
ImagehostImageExtractor(Extractor): basecategory = "imagehost" subcategory = "image" archive_fmt = "{token}" - https = True - params = None - cookies = None - encoding = None + _https = True + _params = None + _cookies = None + _encoding = None def __init__(self, match): Extractor.__init__(self, match) self.page_url = "http{}://{}".format( - "s" if self.https else "", match.group(1)) + "s" if self._https else "", match.group(1)) self.token = match.group(2) - if self.params == "simple": - self.params = { + if self._params == "simple": + self._params = { "imgContinue": "Continue+to+image+...+", } - elif self.params == "complex": - self.params = { + elif self._params == "complex": + self._params = { "op": "view", "id": self.token, "pre": "1", @@ -46,16 +46,16 @@ class ImagehostImageExtractor(Extractor): def items(self): page = self.request( self.page_url, - method=("POST" if self.params else "GET"), - data=self.params, - cookies=self.cookies, - encoding=self.encoding, + method=("POST" if self._params else "GET"), + data=self._params, + cookies=self._cookies, + encoding=self._encoding, ).text url, filename = self.get_info(page) data = text.nameext_from_url(filename, {"token": self.token}) data.update(self.metadata(page)) - if self.https and url.startswith("http:"): + if self._https and url.startswith("http:"): url = "https:" + url[5:] yield Message.Directory, data @@ -102,8 +102,8 @@ class ImxtoImageExtractor(ImagehostImageExtractor): "exception": exception.NotFoundError, }), ) - params = "simple" - encoding = "utf-8" + _params = "simple" + _encoding = "utf-8" def __init__(self, match): ImagehostImageExtractor.__init__(self, match) @@ -153,8 +153,9 @@ class ImxtoGalleryExtractor(ImagehostImageExtractor): "_extractor": ImxtoImageExtractor, "title": text.unescape(title.partition(">")[2]).strip(), } - for url in text.extract_iter(page, '
', '<', pos) return url, text.unescape(filename) +class PostimgGalleryExtractor(ImagehostImageExtractor): + """Extractor for images galleries from postimages.org""" + category = "postimg" + subcategory = "gallery" + pattern = (r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.(?:cc|org)" + r"/(?:gallery/)([^/?#]+)/?)") + test = ("https://postimg.cc/gallery/wxpDLgX", { + "pattern": PostimgImageExtractor.pattern, + "count": 22, + }) + + def items(self): + page = self.request(self.page_url).text + data = {"_extractor": PostimgImageExtractor} + for url in text.extract_iter(page, ' class="thumb"><')[0] + + +class JpgfishImageExtractor(JpgfishExtractor): + """Extractor for jpgfish Images""" + subcategory = "image" + pattern = BASE_PATTERN + r"/img/((?:[^/?#]+\.)?(\w+))" + test = ( + ("https://jpeg.pet/img/funnymeme.LecXGS", { + "pattern": r"https://simp3\.jpg\.church/images/funnymeme\.jpg", + "content": "098e5e9b17ad634358426e0ffd1c93871474d13c", + "keyword": { + "album": "", + "extension": "jpg", + "filename": "funnymeme", + "id": "LecXGS", + "url": "https://simp3.jpg.church/images/funnymeme.jpg", + "user": "exearco", + }, + }), + ("https://jpg.church/img/auCruA", { + "pattern": r"https://simp2\.jpg\.church/hannahowo_00457\.jpg", + "keyword": {"album": "401-500"}, + }), + ("https://jpg.pet/img/funnymeme.LecXGS"), + ("https://jpg.fishing/img/funnymeme.LecXGS"), + ("https://jpg.fish/img/funnymeme.LecXGS"), + ("https://jpg.church/img/funnymeme.LecXGS"), + ) + + def __init__(self, match): + JpgfishExtractor.__init__(self, match) + self.path, self.image_id = match.groups() + + def items(self): + url = "{}/img/{}".format(self.root, self.path) + extr = text.extract_from(self.request(url).text) + + image = { + "id" : self.image_id, + "url" : extr('"), ">", "<")[0] or "", + "user" : extr('username: "', '"'), + } + + text.nameext_from_url(image["url"], image) + yield Message.Directory, image + yield Message.Url, image["url"], image + + +class JpgfishAlbumExtractor(JpgfishExtractor): + """Extractor for jpgfish Albums""" + subcategory = "album" + pattern = BASE_PATTERN + r"/a(?:lbum)?/([^/?#]+)(/sub)?" + test = ( + ("https://jpeg.pet/album/CDilP/?sort=date_desc&page=1", { + "count": 2, + }), + ("https://jpg.fishing/a/gunggingnsk.N9OOI", { + "count": 114, + }), + ("https://jpg.fish/a/101-200.aNJ6A/", { + "count": 100, + }), + ("https://jpg.church/a/hannahowo.aNTdH/sub", { + "count": 606, + }), + ("https://jpg.pet/album/CDilP/?sort=date_desc&page=1"), + ) + + def __init__(self, match): + JpgfishExtractor.__init__(self, match) + self.album, self.sub_albums = match.groups() + + def items(self): + url = "{}/a/{}".format(self.root, self.album) + data = {"_extractor": JpgfishImageExtractor} + + if self.sub_albums: + albums = self._pagination(url + "/sub") + else: + albums = (url,) + + for album in albums: + for image in self._pagination(album): + yield Message.Queue, image, data + + +class JpgfishUserExtractor(JpgfishExtractor): + """Extractor for jpgfish Users""" + subcategory = "user" + pattern = BASE_PATTERN + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?" 
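(Aside: a quick sanity check of the user pattern above; the negative lookahead keeps /img/ and /a(lbum)/ URLs out of the user extractor. The domain alternation below is a simplified assumption standing in for BASE_PATTERN.)

import re

BASE = r"(?:https?://)?(?:jpe?g\.(?:pet|fish(?:ing)?|church))"
user_pattern = re.compile(BASE + r"/(?!img|a(?:lbum)?)([^/?#]+)(/albums)?")

print(user_pattern.match("https://jpeg.pet/exearco").groups())
# ('exearco', None)
print(user_pattern.match("https://jpg.church/exearco/albums").groups())
# ('exearco', '/albums')
print(user_pattern.match("https://jpeg.pet/img/funnymeme.LecXGS"))   # None
print(user_pattern.match("https://jpeg.pet/album/CDilP"))            # None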
+ test = ( + ("https://jpeg.pet/exearco", { + "count": 3, + }), + ("https://jpg.church/exearco/albums", { + "count": 1, + }), + ("https://jpg.pet/exearco"), + ("https://jpg.fishing/exearco"), + ("https://jpg.fish/exearco"), + ("https://jpg.church/exearco"), + ) + + def __init__(self, match): + JpgfishExtractor.__init__(self, match) + self.user, self.albums = match.groups() + + def items(self): + url = "{}/{}".format(self.root, self.user) + + if self.albums: + url += "/albums" + data = {"_extractor": JpgfishAlbumExtractor} + else: + data = {"_extractor": JpgfishImageExtractor} + + for url in self._pagination(url): + yield Message.Queue, url, data diff --git a/gallery_dl/extractor/jschan.py b/gallery_dl/extractor/jschan.py new file mode 100644 index 00000000..fe758faa --- /dev/null +++ b/gallery_dl/extractor/jschan.py @@ -0,0 +1,94 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for jschan Imageboards""" + +from .common import BaseExtractor, Message +from .. import text +import itertools + + +class JschanExtractor(BaseExtractor): + basecategory = "jschan" + + +BASE_PATTERN = JschanExtractor.update({ + "94chan": { + "root": "https://94chan.org", + "pattern": r"94chan\.org" + } +}) + + +class JschanThreadExtractor(JschanExtractor): + """Extractor for jschan threads""" + subcategory = "thread" + directory_fmt = ("{category}", "{board}", + "{threadId} {subject|nomarkup[:50]}") + filename_fmt = "{postId}{num:?-//} {filename}.{extension}" + archive_fmt = "{board}_{postId}_{num}" + pattern = BASE_PATTERN + r"/([^/?#]+)/thread/(\d+)\.html" + test = ( + ("https://94chan.org/art/thread/25.html", { + "pattern": r"https://94chan.org/file/[0-9a-f]{64}(\.\w+)?", + "count": ">= 15" + }) + ) + + def __init__(self, match): + JschanExtractor.__init__(self, match) + index = match.lastindex + self.board = match.group(index-1) + self.thread = match.group(index) + + def items(self): + url = "{}/{}/thread/{}.json".format( + self.root, self.board, self.thread) + thread = self.request(url).json() + thread["threadId"] = thread["postId"] + posts = thread.pop("replies", ()) + + yield Message.Directory, thread + for post in itertools.chain((thread,), posts): + files = post.pop("files", ()) + if files: + thread.update(post) + thread["count"] = len(files) + for num, file in enumerate(files): + url = self.root + "/file/" + file["filename"] + file.update(thread) + file["num"] = num + file["siteFilename"] = file["filename"] + text.nameext_from_url(file["originalFilename"], file) + yield Message.Url, url, file + + +class JschanBoardExtractor(JschanExtractor): + """Extractor for jschan boards""" + subcategory = "board" + pattern = (BASE_PATTERN + r"/([^/?#]+)" + r"(?:/index\.html|/catalog\.html|/\d+\.html|/?$)") + test = ( + ("https://94chan.org/art/", { + "pattern": JschanThreadExtractor.pattern, + "count": ">= 30" + }), + ("https://94chan.org/art/2.html"), + ("https://94chan.org/art/catalog.html"), + ("https://94chan.org/art/index.html"), + ) + + def __init__(self, match): + JschanExtractor.__init__(self, match) + self.board = match.group(match.lastindex) + + def items(self): + url = "{}/{}/catalog.json".format(self.root, self.board) + for thread in self.request(url).json(): + url = "{}/{}/thread/{}.html".format( + self.root, self.board, thread["postId"]) + thread["_extractor"] = JschanThreadExtractor + yield Message.Queue, url, thread diff 
--git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 33e8370a..2ed73e9c 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -14,7 +14,7 @@ from ..cache import cache import itertools import re -BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party" +BASE_PATTERN = r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)" USER_PATTERN = BASE_PATTERN + r"/([^/?#]+)/user/([^/?#]+)" HASH_PATTERN = r"/[0-9a-f]{2}/[0-9a-f]{2}/([0-9a-f]{64})" @@ -26,22 +26,24 @@ class KemonopartyExtractor(Extractor): directory_fmt = ("{category}", "{service}", "{user}") filename_fmt = "{id}_{title}_{num:>02}_{filename[:180]}.{extension}" archive_fmt = "{service}_{user}_{id}_{num}" - cookiedomain = ".kemono.party" + cookies_domain = ".kemono.party" def __init__(self, match): - if match.group(1) == "coomer": - self.category = "coomerparty" - self.cookiedomain = ".coomer.party" + domain = match.group(1) + tld = match.group(2) + self.category = domain + "party" self.root = text.root_from_url(match.group(0)) + self.cookies_domain = ".{}.{}".format(domain, tld) Extractor.__init__(self, match) + + def _init(self): self.session.headers["Referer"] = self.root + "/" + self._prepare_ddosguard_cookies() + self._find_inline = re.compile( + r'src="(?:https?://(?:kemono|coomer)\.(?:party|su))?(/inline/[^"]+' + r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall def items(self): - self._prepare_ddosguard_cookies() - - self._find_inline = re.compile( - r'src="(?:https?://(?:kemono|coomer)\.party)?(/inline/[^"]+' - r'|/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{64}\.[^"]+)').findall find_hash = re.compile(HASH_PATTERN).match generators = self._build_file_generators(self.config("files")) duplicates = self.config("duplicates") @@ -125,10 +127,12 @@ class KemonopartyExtractor(Extractor): def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl( + (username, self.cookies_domain), password)) @cache(maxage=28*24*3600, keyarg=1) def _login_impl(self, username, password): + username = username[0] self.log.info("Logging in as %s", username) url = self.root + "/account/login" @@ -222,11 +226,12 @@ class KemonopartyUserExtractor(KemonopartyExtractor): "options": (("max-posts", 25),), "count": "< 100", }), + ("https://kemono.su/subscribestar/user/alcorart"), ("https://kemono.party/subscribestar/user/alcorart"), ) def __init__(self, match): - _, service, user_id, offset = match.groups() + _, _, service, user_id, offset = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) self.api_url = "{}/api/{}/user/{}".format(self.root, service, user_id) @@ -327,13 +332,14 @@ class KemonopartyPostExtractor(KemonopartyExtractor): r"f51c10adc9dabd86e92bd52339f298b9\.txt", "content": "da39a3ee5e6b4b0d3255bfef95601890afd80709", # empty }), + ("https://kemono.su/subscribestar/user/alcorart/post/184330"), ("https://kemono.party/subscribestar/user/alcorart/post/184330"), ("https://www.kemono.party/subscribestar/user/alcorart/post/184330"), ("https://beta.kemono.party/subscribestar/user/alcorart/post/184330"), ) def __init__(self, match): - _, service, user_id, post_id = match.groups() + _, _, service, user_id, post_id = match.groups() self.subcategory = service KemonopartyExtractor.__init__(self, match) self.api_url = "{}/api/{}/user/{}/post/{}".format( @@ -359,9 +365,9 @@ class 
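(Aside: because the kemono/coomer BASE_PATTERN above now captures the TLD as a second group, every later match.groups() unpacking gains one leading value, which is why the user, post, discord and favorite extractors in this file each add an extra underscore. A trimmed comparison with illustrative URLs:)

import re

OLD = (r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.party"
       r"/([^/?#]+)/user/([^/?#]+)")
NEW = (r"(?:https?://)?(?:www\.|beta\.)?(kemono|coomer)\.(party|su)"
       r"/([^/?#]+)/user/([^/?#]+)")

url = "https://kemono.su/patreon/user/12345"
print(re.match(NEW, url).groups())   # ('kemono', 'su', 'patreon', '12345')
print(re.match(OLD, url))            # None, the old pattern only knew .party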
KemonopartyDiscordExtractor(KemonopartyExtractor): "count": 4, "keyword": {"channel_name": "finish-work"}, }), - (("https://kemono.party/discord" + (("https://kemono.su/discord" "/server/256559665620451329/channel/462437519519383555#"), { - "pattern": r"https://kemono\.party/data/(" + "pattern": r"https://kemono\.su/data/(" r"e3/77/e377e3525164559484ace2e64425b0cec1db08.*\.png|" r"51/45/51453640a5e0a4d23fbf57fb85390f9c5ec154.*\.gif)", "keyword": {"hash": "re:e377e3525164559484ace2e64425b0cec1db08" @@ -380,7 +386,7 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - _, self.server, self.channel, self.channel_name = match.groups() + _, _, self.server, self.channel, self.channel_name = match.groups() def items(self): self._prepare_ddosguard_cookies() @@ -455,14 +461,20 @@ class KemonopartyDiscordExtractor(KemonopartyExtractor): class KemonopartyDiscordServerExtractor(KemonopartyExtractor): subcategory = "discord-server" pattern = BASE_PATTERN + r"/discord/server/(\d+)$" - test = ("https://kemono.party/discord/server/488668827274444803", { - "pattern": KemonopartyDiscordExtractor.pattern, - "count": 13, - }) + test = ( + ("https://kemono.party/discord/server/488668827274444803", { + "pattern": KemonopartyDiscordExtractor.pattern, + "count": 13, + }), + ("https://kemono.su/discord/server/488668827274444803", { + "pattern": KemonopartyDiscordExtractor.pattern, + "count": 13, + }), + ) def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.server = match.group(2) + self.server = match.group(3) def items(self): url = "{}/api/discord/channels/lookup?q={}".format( @@ -491,11 +503,16 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): "url": "ecfccf5f0d50b8d14caa7bbdcf071de5c1e5b90f", "count": 3, }), + ("https://kemono.su/favorites?type=post", { + "pattern": KemonopartyPostExtractor.pattern, + "url": "4be8e84cb384a907a8e7997baaf6287b451783b5", + "count": 3, + }), ) def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.favorites = (text.parse_query(match.group(2)).get("type") or + self.favorites = (text.parse_query(match.group(3)).get("type") or self.config("favorites") or "artist") diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py new file mode 100644 index 00000000..43fc24e1 --- /dev/null +++ b/gallery_dl/extractor/lensdump.py @@ -0,0 +1,161 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://lensdump.com/""" + +from .common import GalleryExtractor, Extractor, Message +from .. 
import text, util + +BASE_PATTERN = r"(?:https?://)?lensdump\.com" + + +class LensdumpBase(): + """Base class for lensdump extractors""" + category = "lensdump" + root = "https://lensdump.com" + + def nodes(self, page=None): + if page is None: + page = self.request(self.url).text + + # go through all pages starting from the oldest + page_url = text.urljoin(self.root, text.extr( + text.extr(page, ' id="list-most-oldest-link"', '>'), + 'href="', '"')) + while page_url is not None: + if page_url == self.url: + current_page = page + else: + current_page = self.request(page_url).text + + for node in text.extract_iter( + current_page, ' class="list-item ', '>'): + yield node + + # find url of next page + page_url = text.extr( + text.extr(current_page, ' data-pagination="next"', '>'), + 'href="', '"') + if page_url is not None and len(page_url) > 0: + page_url = text.urljoin(self.root, page_url) + else: + page_url = None + + +class LensdumpAlbumExtractor(LensdumpBase, GalleryExtractor): + subcategory = "album" + pattern = BASE_PATTERN + r"/(?:((?!\w+/albums|a/|i/)\w+)|a/(\w+))" + test = ( + ("https://lensdump.com/a/1IhJr", { + "pattern": r"https://[abcd]\.l3n\.co/i/tq\w{4}\.png", + "keyword": { + "extension": "png", + "name": str, + "num": int, + "title": str, + "url": str, + "width": int, + }, + }), + ) + + def __init__(self, match): + GalleryExtractor.__init__(self, match, match.string) + self.gallery_id = match.group(1) or match.group(2) + + def metadata(self, page): + return { + "gallery_id": self.gallery_id, + "title": text.unescape(text.extr( + page, 'property="og:title" content="', '"').strip()) + } + + def images(self, page): + for node in self.nodes(page): + # get urls and filenames of images in current page + json_data = util.json_loads(text.unquote( + text.extr(node, "data-object='", "'") or + text.extr(node, 'data-object="', '"'))) + image_id = json_data.get('name') + image_url = json_data.get('url') + image_title = json_data.get('title') + if image_title is not None: + image_title = text.unescape(image_title) + yield (image_url, { + 'id': image_id, + 'url': image_url, + 'title': image_title, + 'name': json_data.get('filename'), + 'filename': image_id, + 'extension': json_data.get('extension'), + 'height': text.parse_int(json_data.get('height')), + 'width': text.parse_int(json_data.get('width')), + }) + + +class LensdumpAlbumsExtractor(LensdumpBase, Extractor): + """Extractor for album list from lensdump.com""" + subcategory = "albums" + pattern = BASE_PATTERN + r"/\w+/albums" + test = ("https://lensdump.com/vstar925/albums",) + + def items(self): + for node in self.nodes(): + album_url = text.urljoin(self.root, text.extr( + node, 'data-url-short="', '"')) + yield Message.Queue, album_url, { + "_extractor": LensdumpAlbumExtractor} + + +class LensdumpImageExtractor(LensdumpBase, Extractor): + """Extractor for individual images on lensdump.com""" + subcategory = "image" + filename_fmt = "{category}_{id}{title:?_//}.{extension}" + directory_fmt = ("{category}",) + archive_fmt = "{id}" + pattern = BASE_PATTERN + r"/i/(\w+)" + test = ( + ("https://lensdump.com/i/tyoAyM", { + "pattern": r"https://c\.l3n\.co/i/tyoAyM\.webp", + "content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", + "keyword": { + "date": "dt:2022-08-01 08:24:28", + "extension": "webp", + "filename": "tyoAyM", + "height": 400, + "id": "tyoAyM", + "title": "MYOBI clovis bookcaseset", + "url": "https://c.l3n.co/i/tyoAyM.webp", + "width": 620, + }, + }), + ) + + def __init__(self, match): + Extractor.__init__(self, match) + 
self.key = match.group(1) + + def items(self): + url = "{}/i/{}".format(self.root, self.key) + extr = text.extract_from(self.request(url).text) + + data = { + "id" : self.key, + "title" : text.unescape(extr( + 'property="og:title" content="', '"')), + "url" : extr( + 'property="og:image" content="', '"'), + "width" : text.parse_int(extr( + 'property="image:width" content="', '"')), + "height": text.parse_int(extr( + 'property="image:height" content="', '"')), + "date" : text.parse_datetime(extr( + '"), 1): - src = text.extr(img, 'src="', '"') - alt = text.extr(img, 'alt="', '"') - - if not src: - continue - if src.startswith("https://obs.line-scdn.") and src.count("/") > 3: - src = src.rpartition("/")[0] - - imgs.append(text.nameext_from_url(alt or src, { - "url" : src, - "num" : num, - "hash": src.rpartition("/")[2], - "post": post, - })) - - return imgs - - -class LineblogBlogExtractor(LineblogBase, LivedoorBlogExtractor): - """Extractor for a user's blog on lineblog.me""" - pattern = r"(?:https?://)?lineblog\.me/(\w+)/?(?:$|[?#])" - test = ("https://lineblog.me/mamoru_miyano/", { - "range": "1-20", - "count": 20, - "pattern": r"https://obs.line-scdn.net/[\w-]+$", - "keyword": { - "post": { - "categories" : tuple, - "date" : "type:datetime", - "description": str, - "id" : int, - "tags" : list, - "title" : str, - "user" : "mamoru_miyano" - }, - "filename": str, - "hash" : r"re:\w{32,}", - "num" : int, - }, - }) - - -class LineblogPostExtractor(LineblogBase, LivedoorPostExtractor): - """Extractor for blog posts on lineblog.me""" - pattern = r"(?:https?://)?lineblog\.me/(\w+)/archives/(\d+)" - test = ("https://lineblog.me/mamoru_miyano/archives/1919150.html", { - "url": "24afeb4044c554f80c374b52bf8109c6f1c0c757", - "keyword": "76a38e2c0074926bd3362f66f9fc0e6c41591dcb", - }) diff --git a/gallery_dl/extractor/lolisafe.py b/gallery_dl/extractor/lolisafe.py index 5d236c37..9cebe3ae 100644 --- a/gallery_dl/extractor/lolisafe.py +++ b/gallery_dl/extractor/lolisafe.py @@ -46,9 +46,10 @@ class LolisafeAlbumExtractor(LolisafeExtractor): LolisafeExtractor.__init__(self, match) self.album_id = match.group(match.lastindex) + def _init(self): domain = self.config("domain") if domain == "auto": - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.url) elif domain: self.root = text.ensure_http_scheme(domain) diff --git a/gallery_dl/extractor/luscious.py b/gallery_dl/extractor/luscious.py index 57db0c9d..dcf09d16 100644 --- a/gallery_dl/extractor/luscious.py +++ b/gallery_dl/extractor/luscious.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -15,7 +15,7 @@ from .. 
import text, exception class LusciousExtractor(Extractor): """Base class for luscious extractors""" category = "luscious" - cookiedomain = ".luscious.net" + cookies_domain = ".luscious.net" root = "https://members.luscious.net" def _graphql(self, op, variables, query): @@ -118,6 +118,8 @@ class LusciousAlbumExtractor(LusciousExtractor): def __init__(self, match): LusciousExtractor.__init__(self, match) self.album_id = match.group(1) + + def _init(self): self.gif = self.config("gif", False) def items(self): diff --git a/gallery_dl/extractor/mangadex.py b/gallery_dl/extractor/mangadex.py index 409483b4..e12e56b4 100644 --- a/gallery_dl/extractor/mangadex.py +++ b/gallery_dl/extractor/mangadex.py @@ -30,9 +30,11 @@ class MangadexExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) + self.uuid = match.group(1) + + def _init(self): self.session.headers["User-Agent"] = util.USERAGENT self.api = MangadexAPI(self) - self.uuid = match.group(1) def items(self): for chapter in self.chapters(): @@ -85,6 +87,10 @@ class MangadexExtractor(Extractor): data["group"] = [group["attributes"]["name"] for group in relationships["scanlation_group"]] + data["status"] = mattributes["status"] + data["tags"] = [tag["attributes"]["name"]["en"] + for tag in mattributes["tags"]] + return data @@ -94,13 +100,13 @@ class MangadexChapterExtractor(MangadexExtractor): pattern = BASE_PATTERN + r"/chapter/([0-9a-f-]+)" test = ( ("https://mangadex.org/chapter/f946ac53-0b71-4b5d-aeb2-7931b13c4aaa", { - "keyword": "86fb262cf767dac6d965cd904ad499adba466404", + "keyword": "e86128a79ebe7201b648f1caa828496a2878dc8f", # "content": "50383a4c15124682057b197d40261641a98db514", }), # oneshot ("https://mangadex.org/chapter/61a88817-9c29-4281-bdf1-77b3c1be9831", { "count": 64, - "keyword": "6abcbe1e24eeb1049dc931958853cd767ee483fb", + "keyword": "d11ed057a919854696853362be35fc0ba7dded4c", }), # MANGA Plus (#1154) ("https://mangadex.org/chapter/74149a55-e7c4-44ea-8a37-98e879c1096f", { @@ -144,6 +150,7 @@ class MangadexMangaExtractor(MangadexExtractor): pattern = BASE_PATTERN + r"/(?:title|manga)/(?!feed$)([0-9a-f-]+)" test = ( ("https://mangadex.org/title/f90c4398-8aad-4f51-8a1f-024ca09fdcbc", { + "count": ">= 5", "keyword": { "manga" : "Souten no Koumori", "manga_id": "f90c4398-8aad-4f51-8a1f-024ca09fdcbc", @@ -157,6 +164,19 @@ class MangadexMangaExtractor(MangadexExtractor): "language": str, "artist" : ["Arakawa Hiromu"], "author" : ["Arakawa Hiromu"], + "status" : "completed", + "tags" : ["Oneshot", "Historical", "Action", + "Martial Arts", "Drama", "Tragedy"], + }, + }), + # mutliple values for 'lang' (#4093) + ("https://mangadex.org/title/f90c4398-8aad-4f51-8a1f-024ca09fdcbc", { + "options": (("lang", "fr,it"),), + "count": 2, + "keyword": { + "manga" : "Souten no Koumori", + "lang" : "re:fr|it", + "language": "re:French|Italian", }, }), ("https://mangadex.cc/manga/d0c88e3b-ea64-4e07-9841-c1d2ac982f4a/", { @@ -186,13 +206,16 @@ class MangadexFeedExtractor(MangadexExtractor): class MangadexAPI(): - """Interface for the MangaDex API v5""" + """Interface for the MangaDex API v5 + + https://api.mangadex.org/docs/ + """ def __init__(self, extr): self.extractor = extr self.headers = {} - self.username, self.password = self.extractor._get_auth_info() + self.username, self.password = extr._get_auth_info() if not self.username: self.authenticate = util.noop @@ -278,9 +301,13 @@ class MangadexAPI(): if ratings is None: ratings = ("safe", "suggestive", "erotica", "pornographic") + lang = config("lang") + if 
isinstance(lang, str) and "," in lang: + lang = lang.split(",") + params["contentRating[]"] = ratings + params["translatedLanguage[]"] = lang params["includes[]"] = ("scanlation_group",) - params["translatedLanguage[]"] = config("lang") params["offset"] = 0 api_params = config("api-parameters") diff --git a/gallery_dl/extractor/mangafox.py b/gallery_dl/extractor/mangafox.py index 0818fd90..8478b8de 100644 --- a/gallery_dl/extractor/mangafox.py +++ b/gallery_dl/extractor/mangafox.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -33,6 +33,8 @@ class MangafoxChapterExtractor(ChapterExtractor): base, self.cstr, self.volume, self.chapter, self.minor = match.groups() self.urlbase = self.root + base ChapterExtractor.__init__(self, match, self.urlbase + "/1.html") + + def _init(self): self.session.headers["Referer"] = self.root + "/" def metadata(self, page): diff --git a/gallery_dl/extractor/mangahere.py b/gallery_dl/extractor/mangahere.py index 531aef48..745231b1 100644 --- a/gallery_dl/extractor/mangahere.py +++ b/gallery_dl/extractor/mangahere.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2015-2022 Mike Fährmann +# Copyright 2015-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -42,6 +42,8 @@ class MangahereChapterExtractor(MangahereBase, ChapterExtractor): self.part, self.volume, self.chapter = match.groups() url = self.url_fmt.format(self.part, 1) ChapterExtractor.__init__(self, match, url) + + def _init(self): self.session.headers["Referer"] = self.root_mobile + "/" def metadata(self, page): @@ -112,9 +114,8 @@ class MangahereMangaExtractor(MangahereBase, MangaExtractor): ("https://m.mangahere.co/manga/aria/"), ) - def __init__(self, match): - MangaExtractor.__init__(self, match) - self.session.cookies.set("isAdult", "1", domain="www.mangahere.cc") + def _init(self): + self.cookies.set("isAdult", "1", domain="www.mangahere.cc") def chapters(self, page): results = [] diff --git a/gallery_dl/extractor/mangakakalot.py b/gallery_dl/extractor/mangakakalot.py index ba55ac16..e397586e 100644 --- a/gallery_dl/extractor/mangakakalot.py +++ b/gallery_dl/extractor/mangakakalot.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright 2020 Jake Mannens -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -39,7 +39,9 @@ class MangakakalotChapterExtractor(MangakakalotBase, ChapterExtractor): def __init__(self, match): self.path = match.group(1) ChapterExtractor.__init__(self, match, self.root + self.path) - self.session.headers['Referer'] = self.root + + def _init(self): + self.session.headers['Referer'] = self.root + "/" def metadata(self, page): _ , pos = text.extract(page, '', '<') diff --git a/gallery_dl/extractor/manganelo.py b/gallery_dl/extractor/manganelo.py index 5ba18a3e..807bc5ee 100644 --- a/gallery_dl/extractor/manganelo.py +++ b/gallery_dl/extractor/manganelo.py @@ -16,21 +16,28 @@ BASE_PATTERN = r"(?:https?://)?((?:chap|read|www\.|m\.)?mangan(?:at|el)o\.com)" class ManganeloBase(): category = "manganelo" root = "https://chapmanganato.com" + _match_chapter = None def __init__(self, match): domain, path 
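(Aside: the 'lang' handling added above lets one option value fan out into several translatedLanguage[] parameters. A tiny standalone mirror of that logic, with a hypothetical helper name:)

def translated_languages(lang):
    if isinstance(lang, str) and "," in lang:
        lang = lang.split(",")
    return lang

print(translated_languages("fr,it"))  # ['fr', 'it'], sent as two values
print(translated_languages("en"))     # 'en', passed through unchanged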
= match.groups() super().__init__(match, "https://" + domain + path) - self.session.headers['Referer'] = self.root - self._match_chapter = re.compile( - r"(?:[Vv]ol\.?\s*(\d+)\s?)?" - r"[Cc]hapter\s*([^:]+)" - r"(?::\s*(.+))?").match + def _init(self): + self.session.headers['Referer'] = self.root + "/" + + if self._match_chapter is None: + ManganeloBase._match_chapter = re.compile( + r"(?:[Vv]ol\.?\s*(\d+)\s?)?" + r"[Cc]hapter\s*(\d+)([^:]*)" + r"(?::\s*(.+))?").match def _parse_chapter(self, info, manga, author, date=None): match = self._match_chapter(info) - volume, chapter, title = match.groups() if match else ("", "", info) - chapter, sep, minor = chapter.partition(".") + if match: + volume, chapter, minor, title = match.groups() + else: + volume = chapter = minor = "" + title = info return { "manga" : manga, @@ -39,7 +46,7 @@ class ManganeloBase(): "title" : text.unescape(title) if title else "", "volume" : text.parse_int(volume), "chapter" : text.parse_int(chapter), - "chapter_minor": sep + minor, + "chapter_minor": minor, "lang" : "en", "language" : "English", } @@ -61,6 +68,10 @@ class ManganeloChapterExtractor(ManganeloBase, ChapterExtractor): "keyword": "06e01fa9b3fc9b5b954c0d4a98f0153b40922ded", "count": 45, }), + ("https://chapmanganato.com/manga-no991297/chapter-8", { + "keyword": {"chapter": 8, "chapter_minor": "-1"}, + "count": 20, + }), ("https://readmanganato.com/manga-gn983696/chapter-23"), ("https://manganelo.com/chapter/gamers/chapter_15"), ("https://manganelo.com/chapter/gq921227/chapter_23"), diff --git a/gallery_dl/extractor/mangapark.py b/gallery_dl/extractor/mangapark.py index 168fbe84..a0d1e80b 100644 --- a/gallery_dl/extractor/mangapark.py +++ b/gallery_dl/extractor/mangapark.py @@ -8,155 +8,464 @@ """Extractors for https://mangapark.net/""" -from .common import ChapterExtractor, MangaExtractor +from .common import ChapterExtractor, Extractor, Message from .. import text, util, exception import re +BASE_PATTERN = r"(?:https?://)?(?:www\.)?mangapark\.(?:net|com|org|io|me)" + class MangaparkBase(): """Base class for mangapark extractors""" category = "mangapark" - root_fmt = "https://v2.mangapark.{}" - browser = "firefox" + _match_title = None - @staticmethod - def parse_chapter_path(path, data): - """Get volume/chapter information from url-path of a chapter""" - data["volume"], data["chapter_minor"] = 0, "" - for part in path.split("/")[1:]: - key, value = part[0], part[1:] - if key == "c": - chapter, dot, minor = value.partition(".") - data["chapter"] = text.parse_int(chapter) - data["chapter_minor"] = dot + minor - elif key == "i": - data["chapter_id"] = text.parse_int(value) - elif key == "v": - data["volume"] = text.parse_int(value) - elif key == "s": - data["stream"] = text.parse_int(value) - elif key == "e": - data["chapter_minor"] = "v" + value - - @staticmethod - def parse_chapter_title(title, data): - match = re.search(r"(?i)(?:vol(?:ume)?[ .]*(\d+) )?" - r"ch(?:apter)?[ .]*(\d+)(\.\w+)?", title) - if match: - vol, ch, data["chapter_minor"] = match.groups() - data["volume"] = text.parse_int(vol) - data["chapter"] = text.parse_int(ch) + def _parse_chapter_title(self, title): + if not self._match_title: + MangaparkBase._match_title = re.compile( + r"(?i)" + r"(?:vol(?:\.|ume)?\s*(\d+)\s*)?" + r"ch(?:\.|apter)?\s*(\d+)([^\s:]*)" + r"(?:\s*:\s*(.*))?" 
+ ).match + match = self._match_title(title) + return match.groups() if match else (0, 0, "", "") class MangaparkChapterExtractor(MangaparkBase, ChapterExtractor): """Extractor for manga-chapters from mangapark.net""" - pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" - r"/manga/([^?#]+/i\d+)") + pattern = BASE_PATTERN + r"/title/[^/?#]+/(\d+)" test = ( - ("https://mangapark.net/manga/gosu/i811653/c055/1", { - "count": 50, - "keyword": "db1ed9af4f972756a25dbfa5af69a8f155b043ff", + ("https://mangapark.net/title/114972-aria/6710214-en-ch.60.2", { + "count": 70, + "pattern": r"https://[\w-]+\.mpcdn\.org/comic/2002/e67" + r"/61e29278a583b9227964076e/\d+_\d+_\d+_\d+\.jpeg" + r"\?acc=[^&#]+&exp=\d+", + "keyword": { + "artist": [], + "author": ["Amano Kozue"], + "chapter": 60, + "chapter_id": 6710214, + "chapter_minor": ".2", + "count": 70, + "date": "dt:2022-01-15 09:25:03", + "extension": "jpeg", + "filename": str, + "genre": ["adventure", "comedy", "drama", "sci_fi", + "shounen", "slice_of_life"], + "lang": "en", + "language": "English", + "manga": "Aria", + "manga_id": 114972, + "page": int, + "source": "Koala", + "title": "Special Navigation - Aquaria Ii", + "volume": 12, + }, }), - (("https://mangapark.net/manga" - "/ad-astra-per-aspera-hata-kenjirou/i662051/c001.2/1"), { - "count": 40, - "keyword": "2bb3a8f426383ea13f17ff5582f3070d096d30ac", - }), - (("https://mangapark.net/manga" - "/gekkan-shoujo-nozaki-kun/i2067426/v7/c70/1"), { - "count": 15, - "keyword": "edc14993c4752cee3a76e09b2f024d40d854bfd1", - }), - ("https://mangapark.me/manga/gosu/i811615/c55/1"), - ("https://mangapark.com/manga/gosu/i811615/c55/1"), + ("https://mangapark.com/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.org/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.io/title/114972-aria/6710214-en-ch.60.2"), + ("https://mangapark.me/title/114972-aria/6710214-en-ch.60.2"), ) def __init__(self, match): - tld, self.path = match.groups() - self.root = self.root_fmt.format(tld) - url = "{}/manga/{}?zoom=2".format(self.root, self.path) + self.root = text.root_from_url(match.group(0)) + url = "{}/title/_/{}".format(self.root, match.group(1)) ChapterExtractor.__init__(self, match, url) def metadata(self, page): - data = text.extract_all(page, ( - ("manga_id" , "var _manga_id = '", "'"), - ("chapter_id", "var _book_id = '", "'"), - ("stream" , "var _stream = '", "'"), - ("path" , "var _book_link = '", "'"), - ("manga" , "
<h2>", "</h2>
"), - ("title" , "
", "<"), - ), values={"lang": "en", "language": "English"})[0] + data = util.json_loads(text.extr( + page, 'id="__NEXT_DATA__" type="application/json">', '<')) + chapter = (data["props"]["pageProps"]["dehydratedState"] + ["queries"][0]["state"]["data"]["data"]) + manga = chapter["comicNode"]["data"] + source = chapter["sourceNode"]["data"] - if not data["path"]: - raise exception.NotFoundError("chapter") + self._urls = chapter["imageSet"]["httpLis"] + self._params = chapter["imageSet"]["wordLis"] + vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) - self.parse_chapter_path(data["path"], data) - if "chapter" not in data: - self.parse_chapter_title(data["title"], data) - - data["manga"], _, data["type"] = data["manga"].rpartition(" ") - data["manga"] = text.unescape(data["manga"]) - data["title"] = data["title"].partition(": ")[2] - for key in ("manga_id", "chapter_id", "stream"): - data[key] = text.parse_int(data[key]) - - return data + return { + "manga" : manga["name"], + "manga_id" : manga["id"], + "artist" : source["artists"], + "author" : source["authors"], + "genre" : source["genres"], + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(ch), + "chapter_minor": minor, + "chapter_id": chapter["id"], + "title" : chapter["title"] or title or "", + "lang" : chapter["lang"], + "language" : util.code_to_language(chapter["lang"]), + "source" : source["srcTitle"], + "source_id" : source["id"], + "date" : text.parse_timestamp(chapter["dateCreate"] // 1000), + } def images(self, page): - data = util.json_loads(text.extr(page, "var _load_pages =", ";")) return [ - (text.urljoin(self.root, item["u"]), { - "width": text.parse_int(item["w"]), - "height": text.parse_int(item["h"]), - }) - for item in data + (url + "?" + params, None) + for url, params in zip(self._urls, self._params) ] -class MangaparkMangaExtractor(MangaparkBase, MangaExtractor): +class MangaparkMangaExtractor(MangaparkBase, Extractor): """Extractor for manga from mangapark.net""" - chapterclass = MangaparkChapterExtractor - pattern = (r"(?:https?://)?(?:www\.|v2\.)?mangapark\.(me|net|com)" - r"(/manga/[^/?#]+)/?$") + subcategory = "manga" + pattern = BASE_PATTERN + r"/title/(\d+)(?:-[^/?#]*)?/?$" test = ( - ("https://mangapark.net/manga/aria", { - "url": "51c6d82aed5c3c78e0d3f980b09a998e6a2a83ee", - "keyword": "cabc60cf2efa82749d27ac92c495945961e4b73c", + ("https://mangapark.net/title/114972-aria", { + "count": 141, + "pattern": MangaparkChapterExtractor.pattern, + "keyword": { + "chapter": int, + "chapter_id": int, + "chapter_minor": str, + "date": "type:datetime", + "lang": "en", + "language": "English", + "manga_id": 114972, + "source": "re:Horse|Koala", + "source_id": int, + "title": str, + "volume": int, + }, }), - ("https://mangapark.me/manga/aria"), - ("https://mangapark.com/manga/aria"), + # 'source' option + ("https://mangapark.net/title/114972-aria", { + "options": (("source", "koala"),), + "count": 70, + "pattern": MangaparkChapterExtractor.pattern, + "keyword": { + "source": "Koala", + "source_id": 15150116, + }, + }), + ("https://mangapark.com/title/114972-"), + ("https://mangapark.com/title/114972"), + ("https://mangapark.com/title/114972-aria"), + ("https://mangapark.org/title/114972-aria"), + ("https://mangapark.io/title/114972-aria"), + ("https://mangapark.me/title/114972-aria"), ) def __init__(self, match): - self.root = self.root_fmt.format(match.group(1)) - MangaExtractor.__init__(self, match, self.root + match.group(2)) + self.root = text.root_from_url(match.group(0)) + 
self.manga_id = int(match.group(1)) + Extractor.__init__(self, match) - def chapters(self, page): - results = [] - data = {"lang": "en", "language": "English"} - data["manga"] = text.unescape( - text.extr(page, '', ' Manga - ')) + def items(self): + for chapter in self.chapters(): + chapter = chapter["data"] + url = self.root + chapter["urlPath"] - for stream in page.split('<div id="stream_')[1:]: - data["stream"] = text.parse_int(text.extr(stream, '', '"')) + vol, ch, minor, title = self._parse_chapter_title(chapter["dname"]) + data = { + "manga_id" : self.manga_id, + "volume" : text.parse_int(vol), + "chapter" : text.parse_int(ch), + "chapter_minor": minor, + "chapter_id": chapter["id"], + "title" : chapter["title"] or title or "", + "lang" : chapter["lang"], + "language" : util.code_to_language(chapter["lang"]), + "source" : chapter["srcTitle"], + "source_id" : chapter["sourceId"], + "date" : text.parse_timestamp( + chapter["dateCreate"] // 1000), + "_extractor": MangaparkChapterExtractor, + } + yield Message.Queue, url, data - for chapter in text.extract_iter(stream, '<li ', '</li>'): - path , pos = text.extract(chapter, 'href="', '"') - title1, pos = text.extract(chapter, '>', '<', pos) - title2, pos = text.extract(chapter, '>: </span>', '<', pos) - count , pos = text.extract(chapter, ' of ', ' ', pos) + def chapters(self): + source = self.config("source") + if not source: + return self.chapters_all() - self.parse_chapter_path(path[8:], data) - if "chapter" not in data: - self.parse_chapter_title(title1, data) + source_id = self._select_source(source) + self.log.debug("Requesting chapters for source_id %s", source_id) + return self.chapters_source(source_id) - if title2: - data["title"] = title2.strip() - else: - data["title"] = title1.partition(":")[2].strip() + def chapters_all(self): + pnum = 0 + variables = { + "select": { + "comicId": self.manga_id, + "range" : None, + "isAsc" : not self.config("chapter-reverse"), + } + } - data["count"] = text.parse_int(count) - results.append((self.root + path, data.copy())) - data.pop("chapter", None) + while True: + data = self._request_graphql( + "get_content_comicChapterRangeList", variables) - return results + for item in data["items"]: + yield from item["chapterNodes"] + + if not pnum: + pager = data["pager"] + pnum += 1 + + try: + variables["select"]["range"] = pager[pnum] + except IndexError: + return + + def chapters_source(self, source_id): + variables = { + "sourceId": source_id, + } + chapters = self._request_graphql( + "get_content_source_chapterList", variables) + + if self.config("chapter-reverse"): + chapters.reverse() + return chapters + + def _select_source(self, source): + if isinstance(source, int): + return source + + group, _, lang = source.partition(":") + group = group.lower() + + variables = { + "comicId" : self.manga_id, + "dbStatuss" : ["normal"], + "haveChapter": True, + } + for item in self._request_graphql( + "get_content_comic_sources", variables): + data = item["data"] + if (not group or data["srcTitle"].lower() == group) and ( + not lang or data["lang"] == lang): + return data["id"] + + raise exception.StopExtraction( + "'%s' does not match any available source", source) + + def _request_graphql(self, opname, variables): + url = self.root + "/apo/" + data = { + "query" : QUERIES[opname], + "variables" : util.json_dumps(variables), + "operationName": opname, + } + return self.request( + url, method="POST", json=data).json()["data"][opname] + + +QUERIES = { + "get_content_comicChapterRangeList": """ + query 
get_content_comicChapterRangeList($select: Content_ComicChapterRangeList_Select) { + get_content_comicChapterRangeList( + select: $select + ) { + reqRange{x y} + missing + pager {x y} + items{ + serial + chapterNodes { + + id + data { + + + id + sourceId + + dbStatus + isNormal + isHidden + isDeleted + isFinal + + dateCreate + datePublic + dateModify + lang + volume + serial + dname + title + urlPath + + srcTitle srcColor + + count_images + + stat_count_post_child + stat_count_post_reply + stat_count_views_login + stat_count_views_guest + + userId + userNode { + + id + data { + +id +name +uniq +avatarUrl +urlPath + +verified +deleted +banned + +dateCreate +dateOnline + +stat_count_chapters_normal +stat_count_chapters_others + +is_adm is_mod is_vip is_upr + + } + + } + + disqusId + + + } + + sser_read + } + } + + } + } +""", + + "get_content_source_chapterList": """ + query get_content_source_chapterList($sourceId: Int!) { + get_content_source_chapterList( + sourceId: $sourceId + ) { + + id + data { + + + id + sourceId + + dbStatus + isNormal + isHidden + isDeleted + isFinal + + dateCreate + datePublic + dateModify + lang + volume + serial + dname + title + urlPath + + srcTitle srcColor + + count_images + + stat_count_post_child + stat_count_post_reply + stat_count_views_login + stat_count_views_guest + + userId + userNode { + + id + data { + +id +name +uniq +avatarUrl +urlPath + +verified +deleted +banned + +dateCreate +dateOnline + +stat_count_chapters_normal +stat_count_chapters_others + +is_adm is_mod is_vip is_upr + + } + + } + + disqusId + + + } + + } + } +""", + + "get_content_comic_sources": """ + query get_content_comic_sources($comicId: Int!, $dbStatuss: [String] = [], $userId: Int, $haveChapter: Boolean, $sortFor: String) { + get_content_comic_sources( + comicId: $comicId + dbStatuss: $dbStatuss + userId: $userId + haveChapter: $haveChapter + sortFor: $sortFor + ) { + +id +data{ + + id + + dbStatus + isNormal + isHidden + isDeleted + + lang name altNames authors artists + + release + genres summary{code} extraInfo{code} + + urlCover600 + urlCover300 + urlCoverOri + + srcTitle srcColor + + chapterCount + chapterNode_last { + id + data { + dateCreate datePublic dateModify + volume serial + dname title + urlPath + userNode { + id data {uniq name} + } + } + } +} + + } + } +""", +} diff --git a/gallery_dl/extractor/mangaread.py b/gallery_dl/extractor/mangaread.py new file mode 100644 index 00000000..74c239e9 --- /dev/null +++ b/gallery_dl/extractor/mangaread.py @@ -0,0 +1,192 @@ +# -*- coding: utf-8 -*- + +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Extractors for https://mangaread.org/""" + +from .common import ChapterExtractor, MangaExtractor +from .. 
import text, exception +import re + + +class MangareadBase(): + """Base class for Mangaread extractors""" + category = "mangaread" + root = "https://www.mangaread.org" + + @staticmethod + def parse_chapter_string(chapter_string, data): + match = re.match( + r"(?:(.+)\s*-\s*)?[Cc]hapter\s*(\d+)(\.\d+)?(?:\s*-\s*(.+))?", + text.unescape(chapter_string).strip()) + manga, chapter, minor, title = match.groups() + manga = manga.strip() if manga else "" + data["manga"] = data.pop("manga", manga) + data["chapter"] = text.parse_int(chapter) + data["chapter_minor"] = minor or "" + data["title"] = title or "" + data["lang"] = "en" + data["language"] = "English" + + +class MangareadChapterExtractor(MangareadBase, ChapterExtractor): + """Extractor for manga-chapters from mangaread.org""" + pattern = (r"(?:https?://)?(?:www\.)?mangaread\.org" + r"(/manga/[^/?#]+/[^/?#]+)") + test = ( + ("https://www.mangaread.org/manga/one-piece/chapter-1053-3/", { + "pattern": (r"https://www\.mangaread\.org/wp-content/uploads" + r"/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+"), + "count": 11, + "keyword": { + "manga" : "One Piece", + "title" : "", + "chapter" : 1053, + "chapter_minor": ".3", + "tags" : ["Oda Eiichiro"], + "lang" : "en", + "language": "English", + } + }), + ("https://www.mangaread.org/manga/one-piece/chapter-1000000/", { + "exception": exception.NotFoundError, + }), + (("https://www.mangaread.org" + "/manga/kanan-sama-wa-akumade-choroi/chapter-10/"), { + "pattern": (r"https://www\.mangaread\.org/wp-content/uploads" + r"/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+"), + "count": 9, + "keyword": { + "manga" : "Kanan-sama wa Akumade Choroi", + "title" : "", + "chapter" : 10, + "chapter_minor": "", + "tags" : list, + "lang" : "en", + "language": "English", + } + }), + # 'Chapter146.5' + # ^^ no whitespace + ("https://www.mangaread.org/manga/above-all-gods/chapter146-5/", { + "pattern": (r"https://www\.mangaread\.org/wp-content/uploads" + r"/WP-manga/data/manga_[^/]+/[^/]+/[^.]+\.\w+"), + "count": 6, + "keyword": { + "manga" : "Above All Gods", + "title" : "", + "chapter" : 146, + "chapter_minor": ".5", + "tags" : list, + "lang" : "en", + "language": "English", + } + }), + ) + + def metadata(self, page): + tags = text.extr(page, 'class="wp-manga-tags-list">', '</div>') + data = {"tags": list(text.split_html(tags)[::2])} + info = text.extr(page, '<h1 id="chapter-heading">', "</h1>") + if not info: + raise exception.NotFoundError("chapter") + self.parse_chapter_string(info, data) + return data + + def images(self, page): + page = text.extr( + page, '<div class="reading-content">', '<div class="entry-header') + return [ + (url.strip(), None) + for url in text.extract_iter(page, 'data-src="', '"') + ] + + +class MangareadMangaExtractor(MangareadBase, MangaExtractor): + """Extractor for manga from mangaread.org""" + chapterclass = MangareadChapterExtractor + pattern = r"(?:https?://)?(?:www\.)?mangaread\.org(/manga/[^/?#]+)/?$" + test = ( + ("https://www.mangaread.org/manga/kanan-sama-wa-akumade-choroi", { + "pattern": (r"https://www\.mangaread\.org/manga" + r"/kanan-sama-wa-akumade-choroi" + r"/chapter-\d+(-.+)?/"), + "count" : ">= 13", + "keyword": { + "manga" : "Kanan-sama wa Akumade Choroi", + "author" : ["nonco"], + "artist" : ["nonco"], + "type" : "Manga", + "genres" : ["Comedy", "Romance", "Shounen", "Supernatural"], + "rating" : float, + "release": 2022, + "status" : "OnGoing", + "lang" : "en", + "language" : "English", + "manga_alt" : list, + "description": str, + } + }), + 
("https://www.mangaread.org/manga/one-piece", { + "pattern": (r"https://www\.mangaread\.org/manga" + r"/one-piece/chapter-\d+(-.+)?/"), + "count" : ">= 1066", + "keyword": { + "manga" : "One Piece", + "author" : ["Oda Eiichiro"], + "artist" : ["Oda Eiichiro"], + "type" : "Manga", + "genres" : list, + "rating" : float, + "release": 1997, + "status" : "OnGoing", + "lang" : "en", + "language" : "English", + "manga_alt" : ["One Piece"], + "description": str, + } + }), + ("https://www.mangaread.org/manga/doesnotexist", { + "exception": exception.HttpError, + }), + ) + + def chapters(self, page): + if 'class="error404' in page: + raise exception.NotFoundError("manga") + data = self.metadata(page) + result = [] + for chapter in text.extract_iter( + page, '<li class="wp-manga-chapter', "</li>"): + url , pos = text.extract(chapter, '<a href="', '"') + info, _ = text.extract(chapter, ">", "</a>", pos) + self.parse_chapter_string(info, data) + result.append((url, data.copy())) + return result + + def metadata(self, page): + extr = text.extract_from(text.extr( + page, 'class="summary_content">', 'class="manga-action"')) + return { + "manga" : text.extr(page, "<h1>", "</h1>").strip(), + "description": text.unescape(text.remove_html(text.extract( + page, ">", "</div>", page.index("summary__content"))[0])), + "rating" : text.parse_float( + extr('total_votes">', "</span>").strip()), + "manga_alt" : text.remove_html( + extr("Alternative </h5>\n</div>", "</div>")).split("; "), + "author" : list(text.extract_iter( + extr('class="author-content">', "</div>"), '"tag">', "</a>")), + "artist" : list(text.extract_iter( + extr('class="artist-content">', "</div>"), '"tag">', "</a>")), + "genres" : list(text.extract_iter( + extr('class="genres-content">', "</div>"), '"tag">', "</a>")), + "type" : text.remove_html( + extr("Type </h5>\n</div>", "</div>")), + "release" : text.parse_int(text.remove_html( + extr("Release </h5>\n</div>", "</div>"))), + "status" : text.remove_html( + extr("Status </h5>\n</div>", "</div>")), + } diff --git a/gallery_dl/extractor/mangasee.py b/gallery_dl/extractor/mangasee.py index b7070f28..00c89c1e 100644 --- a/gallery_dl/extractor/mangasee.py +++ b/gallery_dl/extractor/mangasee.py @@ -90,10 +90,12 @@ class MangaseeChapterExtractor(MangaseeBase, ChapterExtractor): self.category = "mangalife" self.root = "https://manga4life.com" ChapterExtractor.__init__(self, match, self.root + match.group(2)) + + def _init(self): self.session.headers["Referer"] = self.gallery_url domain = self.root.rpartition("/")[2] - cookies = self.session.cookies + cookies = self.cookies if not cookies.get("PHPSESSID", domain=domain): cookies.set("PHPSESSID", util.generate_token(13), domain=domain) diff --git a/gallery_dl/extractor/mangoxo.py b/gallery_dl/extractor/mangoxo.py index ac4c7978..cca18b13 100644 --- a/gallery_dl/extractor/mangoxo.py +++ b/gallery_dl/extractor/mangoxo.py @@ -19,14 +19,14 @@ class MangoxoExtractor(Extractor): """Base class for mangoxo extractors""" category = "mangoxo" root = "https://www.mangoxo.com" - cookiedomain = "www.mangoxo.com" - cookienames = ("SESSION",) + cookies_domain = "www.mangoxo.com" + cookies_names = ("SESSION",) _warning = True def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) elif MangoxoExtractor._warning: MangoxoExtractor._warning = False self.log.warning("Unauthenticated users cannot see " @@ -51,7 +51,7 @@ class 
MangoxoExtractor(Extractor): data = response.json() if str(data.get("result")) != "1": raise exception.AuthenticationError(data.get("msg")) - return {"SESSION": self.session.cookies.get("SESSION")} + return {"SESSION": self.cookies.get("SESSION")} @staticmethod def _sign_by_md5(username, password, token): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index e190c7eb..3bed955c 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -19,12 +19,14 @@ class MastodonExtractor(BaseExtractor): directory_fmt = ("mastodon", "{instance}", "{account[username]}") filename_fmt = "{category}_{id}_{media[id]}.{extension}" archive_fmt = "{media[id]}" - cookiedomain = None + cookies_domain = None def __init__(self, match): BaseExtractor.__init__(self, match) - self.instance = self.root.partition("://")[2] self.item = match.group(match.lastindex) + + def _init(self): + self.instance = self.root.partition("://")[2] self.reblogs = self.config("reblogs", False) self.replies = self.config("replies", True) diff --git a/gallery_dl/extractor/mememuseum.py b/gallery_dl/extractor/mememuseum.py deleted file mode 100644 index 1de0d768..00000000 --- a/gallery_dl/extractor/mememuseum.py +++ /dev/null @@ -1,120 +0,0 @@ -# -*- coding: utf-8 -*- - -# Copyright 2022 Mike Fährmann -# -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://meme.museum/""" - -from .common import Extractor, Message -from .. import text - - -class MememuseumExtractor(Extractor): - """Base class for meme.museum extractors""" - basecategory = "booru" - category = "mememuseum" - filename_fmt = "{category}_{id}_{md5}.{extension}" - archive_fmt = "{id}" - root = "https://meme.museum" - - def items(self): - data = self.metadata() - - for post in self.posts(): - url = post["file_url"] - for key in ("id", "width", "height"): - post[key] = text.parse_int(post[key]) - post["tags"] = text.unquote(post["tags"]) - post.update(data) - yield Message.Directory, post - yield Message.Url, url, text.nameext_from_url(url, post) - - def metadata(self): - """Return general metadata""" - return () - - def posts(self): - """Return an iterable containing data of all relevant posts""" - return () - - -class MememuseumTagExtractor(MememuseumExtractor): - """Extractor for images from meme.museum by search-tags""" - subcategory = "tag" - directory_fmt = ("{category}", "{search_tags}") - pattern = r"(?:https?://)?meme\.museum/post/list/([^/?#]+)" - test = ("https://meme.museum/post/list/animated/1", { - "pattern": r"https://meme\.museum/_images/\w+/\d+%20-%20", - "count": ">= 30" - }) - per_page = 25 - - def __init__(self, match): - MememuseumExtractor.__init__(self, match) - self.tags = text.unquote(match.group(1)) - - def metadata(self): - return {"search_tags": self.tags} - - def posts(self): - pnum = 1 - while True: - url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) - extr = text.extract_from(self.request(url).text) - - while True: - mime = extr("data-mime='", "'") - if not mime: - break - - pid = extr("data-post-id='", "'") - tags, dimensions, size = extr("title='", "'").split(" // ") - md5 = extr("/_thumbs/", "/") - width, _, height = dimensions.partition("x") - - yield { - "file_url": "{}/_images/{}/{}%20-%20{}.{}".format( - self.root, md5, pid, text.quote(tags), - mime.rpartition("/")[2]), - "id": pid, "md5": md5, "tags": tags, - 
"width": width, "height": height, - "size": text.parse_bytes(size[:-1]), - } - - if not extr(">Next<", ">"): - return - pnum += 1 - - -class MememuseumPostExtractor(MememuseumExtractor): - """Extractor for single images from meme.museum""" - subcategory = "post" - pattern = r"(?:https?://)?meme\.museum/post/view/(\d+)" - test = ("https://meme.museum/post/view/10243", { - "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc4997" - r"1f78/10243%20-%20g%20beard%20open_source%20richard_stallm" - r"an%20stallman%20tagme%20text\.jpg", - "keyword": "3c8009251480cf17248c08b2b194dc0c4d59580e", - "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", - }) - - def __init__(self, match): - MememuseumExtractor.__init__(self, match) - self.post_id = match.group(1) - - def posts(self): - url = "{}/post/view/{}".format(self.root, self.post_id) - extr = text.extract_from(self.request(url).text) - - return ({ - "id" : self.post_id, - "tags" : extr(": ", "<"), - "md5" : extr("/_thumbs/", "/"), - "file_url": self.root + extr("id='main_image' src='", "'"), - "width" : extr("data-width=", " ").strip("'\""), - "height" : extr("data-height=", " ").strip("'\""), - "size" : 0, - },) diff --git a/gallery_dl/extractor/misskey.py b/gallery_dl/extractor/misskey.py index 03e91045..8c717581 100644 --- a/gallery_dl/extractor/misskey.py +++ b/gallery_dl/extractor/misskey.py @@ -7,7 +7,7 @@ """Extractors for Misskey instances""" from .common import BaseExtractor, Message -from .. import text +from .. import text, exception class MisskeyExtractor(BaseExtractor): @@ -19,14 +19,18 @@ class MisskeyExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) + self.item = match.group(match.lastindex) + + def _init(self): self.api = MisskeyAPI(self) self.instance = self.root.rpartition("://")[2] - self.item = match.group(match.lastindex) self.renotes = self.config("renotes", False) self.replies = self.config("replies", True) def items(self): for note in self.notes(): + if "note" in note: + note = note["note"] files = note.pop("files") or [] renote = note.get("renote") if renote: @@ -68,7 +72,7 @@ BASE_PATTERN = MisskeyExtractor.update({ }, "lesbian.energy": { "root": "https://lesbian.energy", - "pattern": r"lesbian\.energy" + "pattern": r"lesbian\.energy", }, "sushi.ski": { "root": "https://sushi.ski", @@ -152,6 +156,21 @@ class MisskeyNoteExtractor(MisskeyExtractor): return (self.api.notes_show(self.item),) +class MisskeyFavoriteExtractor(MisskeyExtractor): + """Extractor for favorited notes""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/(?:my|api/i)/favorites" + test = ( + ("https://misskey.io/my/favorites"), + ("https://misskey.io/api/i/favorites"), + ("https://lesbian.energy/my/favorites"), + ("https://sushi.ski/my/favorites"), + ) + + def notes(self): + return self.api.i_favorites() + + class MisskeyAPI(): """Interface for Misskey API @@ -164,6 +183,7 @@ class MisskeyAPI(): self.root = extractor.root self.extractor = extractor self.headers = {"Content-Type": "application/json"} + self.access_token = extractor.config("access-token") def user_id_by_username(self, username): endpoint = "/users/show" @@ -187,6 +207,13 @@ class MisskeyAPI(): data = {"noteId": note_id} return self._call(endpoint, data) + def i_favorites(self): + endpoint = "/i/favorites" + if not self.access_token: + raise exception.AuthenticationError() + data = {"i": self.access_token} + return self._pagination(endpoint, data) + def _call(self, endpoint, data): url = self.root + "/api" + endpoint return 
self.extractor.request( diff --git a/gallery_dl/extractor/moebooru.py b/gallery_dl/extractor/moebooru.py index 0ef0a328..1e56bde9 100644 --- a/gallery_dl/extractor/moebooru.py +++ b/gallery_dl/extractor/moebooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2022 Mike Fährmann +# Copyright 2020-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -166,7 +166,7 @@ class MoebooruTagExtractor(MoebooruExtractor): subcategory = "tag" directory_fmt = ("{category}", "{search_tags}") archive_fmt = "t_{search_tags}_{id}" - pattern = BASE_PATTERN + r"/post\?(?:[^&#]*&)*tags=([^&#]+)" + pattern = BASE_PATTERN + r"/post\?(?:[^&#]*&)*tags=([^&#]*)" test = ( ("https://yande.re/post?tags=ouzoku+armor", { "content": "59201811c728096b2d95ce6896fd0009235fe683", @@ -174,6 +174,8 @@ class MoebooruTagExtractor(MoebooruExtractor): ("https://konachan.com/post?tags=patata", { "content": "838cfb815e31f48160855435655ddf7bfc4ecb8d", }), + # empty 'tags' (#4354) + ("https://konachan.com/post?tags="), ("https://konachan.net/post?tags=patata"), ("https://www.sakugabooru.com/post?tags=nichijou"), ("https://lolibooru.moe/post?tags=ruu_%28tksymkw%29"), diff --git a/gallery_dl/extractor/myhentaigallery.py b/gallery_dl/extractor/myhentaigallery.py index 5dc4cb60..3301da97 100644 --- a/gallery_dl/extractor/myhentaigallery.py +++ b/gallery_dl/extractor/myhentaigallery.py @@ -38,7 +38,9 @@ class MyhentaigalleryGalleryExtractor(GalleryExtractor): self.gallery_id = match.group(1) url = "{}/gallery/thumbnails/{}".format(self.root, self.gallery_id) GalleryExtractor.__init__(self, match, url) - self.session.headers["Referer"] = url + + def _init(self): + self.session.headers["Referer"] = self.gallery_url def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/myportfolio.py b/gallery_dl/extractor/myportfolio.py index 7d23518b..fd16f247 100644 --- a/gallery_dl/extractor/myportfolio.py +++ b/gallery_dl/extractor/myportfolio.py @@ -1,12 +1,12 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2022 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extract images from https://www.myportfolio.com/""" +"""Extractors for https://www.myportfolio.com/""" from .common import Extractor, Message from .. import text, exception @@ -21,7 +21,7 @@ class MyportfolioGalleryExtractor(Extractor): archive_fmt = "{user}_{filename}" pattern = (r"(?:myportfolio:(?:https?://)?([^/]+)|" r"(?:https?://)?([\w-]+\.myportfolio\.com))" - r"(/[^/?&#]+)?") + r"(/[^/?#]+)?") test = ( ("https://andrewling.myportfolio.com/volvo-xc-90-hybrid", { "url": "acea0690c76db0e5cf267648cefd86e921bc3499", diff --git a/gallery_dl/extractor/nana.py b/gallery_dl/extractor/nana.py deleted file mode 100644 index 24e676fe..00000000 --- a/gallery_dl/extractor/nana.py +++ /dev/null @@ -1,118 +0,0 @@ -# -*- coding: utf-8 -*- - -# This program is free software; you can redistribute it and/or modify -# it under the terms of the GNU General Public License version 2 as -# published by the Free Software Foundation. - -"""Extractors for https://nana.my.id/""" - -from .common import GalleryExtractor, Extractor, Message -from .. 
import text, util, exception - - -class NanaGalleryExtractor(GalleryExtractor): - """Extractor for image galleries from nana.my.id""" - category = "nana" - directory_fmt = ("{category}", "{title}") - pattern = r"(?:https?://)?nana\.my\.id/reader/([^/?#]+)" - test = ( - (("https://nana.my.id/reader/" - "059f7de55a4297413bfbd432ce7d6e724dd42bae"), { - "pattern": r"https://nana\.my\.id/reader/" - r"\w+/image/page\?path=.*\.\w+", - "keyword": { - "title" : "Everybody Loves Shion", - "artist": "fuzui", - "tags" : list, - "count" : 29, - }, - }), - (("https://nana.my.id/reader/" - "77c8712b67013e427923573379f5bafcc0c72e46"), { - "pattern": r"https://nana\.my\.id/reader/" - r"\w+/image/page\?path=.*\.\w+", - "keyword": { - "title" : "Lovey-Dovey With an Otaku-Friendly Gyaru", - "artist": "Sueyuu", - "tags" : ["Sueyuu"], - "count" : 58, - }, - }), - ) - - def __init__(self, match): - self.gallery_id = match.group(1) - url = "https://nana.my.id/reader/" + self.gallery_id - GalleryExtractor.__init__(self, match, url) - - def metadata(self, page): - title = text.unescape( - text.extr(page, '</a>  ', '</div>')) - artist = text.unescape(text.extr( - page, '<title>', ''))[len(title):-10] - tags = text.extr(page, 'Reader.tags = "', '"') - - return { - "gallery_id": self.gallery_id, - "title" : title, - "artist" : artist[4:] if artist.startswith(" by ") else "", - "tags" : tags.split(", ") if tags else (), - "lang" : "en", - "language" : "English", - } - - def images(self, page): - data = util.json_loads(text.extr(page, "Reader.pages = ", ".pages")) - return [ - ("https://nana.my.id" + image, None) - for image in data["pages"] - ] - - -class NanaSearchExtractor(Extractor): - """Extractor for nana search results""" - category = "nana" - subcategory = "search" - pattern = r"(?:https?://)?nana\.my\.id(?:/?\?([^#]+))" - test = ( - ('https://nana.my.id/?q=+"elf"&sort=desc', { - "pattern": NanaGalleryExtractor.pattern, - "range": "1-100", - "count": 100, - }), - ("https://nana.my.id/?q=favorites%3A", { - "pattern": NanaGalleryExtractor.pattern, - "count": ">= 2", - }), - ) - - def __init__(self, match): - Extractor.__init__(self, match) - self.params = text.parse_query(match.group(1)) - self.params["p"] = text.parse_int(self.params.get("p"), 1) - self.params["q"] = self.params.get("q") or "" - - def items(self): - if "favorites:" in self.params["q"]: - favkey = self.config("favkey") - if not favkey: - raise exception.AuthenticationError( - "'Favorite key' not provided. " - "Please see 'https://nana.my.id/tutorial'") - self.session.cookies.set("favkey", favkey, domain="nana.my.id") - - data = {"_extractor": NanaGalleryExtractor} - while True: - try: - page = self.request( - "https://nana.my.id", params=self.params).text - except exception.HttpError: - return - - for gallery in text.extract_iter( - page, '
<div class="id3">', '</div>
'): - url = "https://nana.my.id" + text.extr( - gallery, '", "<")) def login(self): - """Login and obtain session cookies""" - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - self._update_cookies(self._login_impl(username, password)) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + self.cookies_update(self._login_impl(username, password)) @cache(maxage=90*24*3600, keyarg=1) def _login_impl(self, username, password): @@ -139,7 +141,7 @@ class NijieExtractor(AsynchronousMixin, BaseExtractor): response = self.request(url, method="POST", data=data) if "/login.php" in response.text: raise exception.AuthenticationError() - return self.session.cookies + return self.cookies def _pagination(self, path): url = "{}/{}.php".format(self.root, path) @@ -172,13 +174,16 @@ BASE_PATTERN = NijieExtractor.update({ class NijieUserExtractor(NijieExtractor): """Extractor for nijie user profiles""" subcategory = "user" - cookiedomain = None + cookies_domain = None pattern = BASE_PATTERN + r"/members\.php\?id=(\d+)" test = ( ("https://nijie.info/members.php?id=44"), ("https://horne.red/members.php?id=58000"), ) + def initialize(self): + pass + def items(self): fmt = "{}/{{}}.php?id={}".format(self.root, self.user_id).format return self._dispatch_extractors(( diff --git a/gallery_dl/extractor/nitter.py b/gallery_dl/extractor/nitter.py index 5f4ceeaf..fda169d8 100644 --- a/gallery_dl/extractor/nitter.py +++ b/gallery_dl/extractor/nitter.py @@ -21,7 +21,7 @@ class NitterExtractor(BaseExtractor): archive_fmt = "{tweet_id}_{num}" def __init__(self, match): - self.cookiedomain = self.root.partition("://")[2] + self.cookies_domain = self.root.partition("://")[2] BaseExtractor.__init__(self, match) lastindex = match.lastindex @@ -35,7 +35,7 @@ class NitterExtractor(BaseExtractor): if videos: ytdl = (videos == "ytdl") videos = True - self._cookiejar.set("hlsPlayback", "on", domain=self.cookiedomain) + self.cookies.set("hlsPlayback", "on", domain=self.cookies_domain) for tweet in self.tweets(): @@ -162,7 +162,11 @@ class NitterExtractor(BaseExtractor): banner = extr('class="profile-banner">Source Link<", ""), "href='", "'")[0], + "source" : text.unescape(text.extr( + extr(">Source Link<", ""), "href='", "'")), } dimensions, size, ext = extr("Info", ">").split(" // ") @@ -74,16 +74,41 @@ class PahealTagExtractor(PahealExtractor): directory_fmt = ("{category}", "{search_tags}") pattern = (r"(?:https?://)?(?:rule34|rule63|cosplay)\.paheal\.net" r"/post/list/([^/?#]+)") - test = ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { - "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", - "count": ">= 15" - }) + test = ( + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "pattern": r"https://[^.]+\.paheal\.net/_images/\w+/\d+%20-%20", + "count": ">= 15" + }), + ("https://rule34.paheal.net/post/list/Ayane_Suzuki/1", { + "range": "1", + "options": (("metadata", True),), + "keyword": { + "date": "dt:2018-01-07 07:04:05", + "duration": 0.0, + "extension": "jpg", + "filename": "2446128 - Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "height": 768, + "id": 2446128, + "md5": "b0ceda9d860df1d15b60293a7eb465c1", + "search_tags": "Ayane_Suzuki", + "size": 205312, + "source": "https://www.pixiv.net/member_illust.php" + "?mode=medium&illust_id=19957280", + "tags": "Ayane_Suzuki Idolmaster " + "idolmaster_dearly_stars Zanzi", + "uploader": "XXXname", + "width": 1024, + }, + }), + ) per_page = 70 def 
__init__(self, match): PahealExtractor.__init__(self, match) self.tags = text.unquote(match.group(1)) + def _init(self): if self.config("metadata"): self._extract_data = self._extract_data_ex @@ -96,8 +121,9 @@ class PahealTagExtractor(PahealExtractor): url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) page = self.request(url).text + pos = page.find("id='image-list'") for post in text.extract_iter( - page, '02}.{extension}") archive_fmt = "{id}" - cookiedomain = "www.pillowfort.social" + cookies_domain = "www.pillowfort.social" def __init__(self, match): Extractor.__init__(self, match) @@ -82,15 +82,14 @@ class PillowfortExtractor(Extractor): yield msgtype, url, post def login(self): - cget = self.session.cookies.get - if cget("_Pf_new_session", domain=self.cookiedomain) \ - or cget("remember_user_token", domain=self.cookiedomain): + if self.cookies.get("_Pf_new_session", domain=self.cookies_domain): + return + if self.cookies.get("remember_user_token", domain=self.cookies_domain): return username, password = self._get_auth_info() if username: - cookies = self._login_impl(username, password) - self._update_cookies(cookies) + self.cookies_update(self._login_impl(username, password)) @cache(maxage=14*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/pinterest.py b/gallery_dl/extractor/pinterest.py index 31ddbcc8..be30705b 100644 --- a/gallery_dl/extractor/pinterest.py +++ b/gallery_dl/extractor/pinterest.py @@ -23,12 +23,10 @@ class PinterestExtractor(Extractor): archive_fmt = "{id}{media_id}" root = "https://www.pinterest.com" - def __init__(self, match): - Extractor.__init__(self, match) - + def _init(self): domain = self.config("domain") if not domain or domain == "auto" : - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.url) else: self.root = text.ensure_http_scheme(domain) @@ -112,7 +110,7 @@ class PinterestExtractor(Extractor): class PinterestPinExtractor(PinterestExtractor): """Extractor for images from a single pin from pinterest.com""" subcategory = "pin" - pattern = BASE_PATTERN + r"/pin/([^/?#&]+)(?!.*#related$)" + pattern = BASE_PATTERN + r"/pin/([^/?#]+)(?!.*#related$)" test = ( ("https://www.pinterest.com/pin/858146903966145189/", { "url": "afb3c26719e3a530bb0e871c480882a801a4e8a5", @@ -121,7 +119,7 @@ class PinterestPinExtractor(PinterestExtractor): }), # video pin (#1189) ("https://www.pinterest.com/pin/422564377542934214/", { - "pattern": r"https://v\.pinimg\.com/videos/mc/hls/d7/22/ff" + "pattern": r"https://v\d*\.pinimg\.com/videos/mc/hls/d7/22/ff" r"/d722ff00ab2352981b89974b37909de8.m3u8", }), ("https://www.pinterest.com/pin/858146903966145188/", { @@ -147,8 +145,8 @@ class PinterestBoardExtractor(PinterestExtractor): subcategory = "board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}") archive_fmt = "{board[id]}_{id}" - pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#&]+)" - "/(?!_saved|_created|pins/)([^/?#&]+)/?$") + pattern = (BASE_PATTERN + r"/(?!pin/)([^/?#]+)" + "/(?!_saved|_created|pins/)([^/?#]+)/?$") test = ( ("https://www.pinterest.com/g1952849/test-/", { "pattern": r"https://i\.pinimg\.com/originals/", @@ -198,7 +196,7 @@ class PinterestBoardExtractor(PinterestExtractor): class PinterestUserExtractor(PinterestExtractor): """Extractor for a user's boards""" subcategory = "user" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)(?:/_saved)?/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)(?:/_saved)?/?$" test = ( 
("https://www.pinterest.com/g1952849/", { "pattern": PinterestBoardExtractor.pattern, @@ -223,7 +221,7 @@ class PinterestAllpinsExtractor(PinterestExtractor): """Extractor for a user's 'All Pins' feed""" subcategory = "allpins" directory_fmt = ("{category}", "{user}") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/pins/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/pins/?$" test = ("https://www.pinterest.com/g1952849/pins/", { "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.\w{3}", @@ -245,10 +243,10 @@ class PinterestCreatedExtractor(PinterestExtractor): """Extractor for a user's created pins""" subcategory = "created" directory_fmt = ("{category}", "{user}") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/_created/?$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/_created/?$" test = ("https://www.pinterest.de/digitalmomblog/_created/", { "pattern": r"https://i\.pinimg\.com/originals/[0-9a-f]{2}" - r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.jpg", + r"/[0-9a-f]{2}/[0-9a-f]{2}/[0-9a-f]{32}\.(jpg|png)", "count": 10, "range": "1-10", }) @@ -270,7 +268,7 @@ class PinterestSectionExtractor(PinterestExtractor): directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "{section[title]}") archive_fmt = "{board[id]}_{id}" - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/([^/?#&]+)" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/([^/?#]+)" test = ("https://www.pinterest.com/g1952849/stuff/section", { "count": 2, }) @@ -321,7 +319,7 @@ class PinterestRelatedPinExtractor(PinterestPinExtractor): """Extractor for related pins of another pin from pinterest.com""" subcategory = "related-pin" directory_fmt = ("{category}", "related {original_pin[id]}") - pattern = BASE_PATTERN + r"/pin/([^/?#&]+).*#related$" + pattern = BASE_PATTERN + r"/pin/([^/?#]+).*#related$" test = ("https://www.pinterest.com/pin/858146903966145189/#related", { "range": "31-70", "count": 40, @@ -340,7 +338,7 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): subcategory = "related-board" directory_fmt = ("{category}", "{board[owner][username]}", "{board[name]}", "related") - pattern = BASE_PATTERN + r"/(?!pin/)([^/?#&]+)/([^/?#&]+)/?#related$" + pattern = BASE_PATTERN + r"/(?!pin/)([^/?#]+)/([^/?#]+)/?#related$" test = ("https://www.pinterest.com/g1952849/test-/#related", { "range": "31-70", "count": 40, @@ -348,13 +346,13 @@ class PinterestRelatedBoardExtractor(PinterestBoardExtractor): }) def pins(self): - return self.api.board_related(self.board["id"]) + return self.api.board_content_recommendation(self.board["id"]) class PinterestPinitExtractor(PinterestExtractor): """Extractor for images from a pin.it URL""" subcategory = "pinit" - pattern = r"(?:https?://)?pin\.it/([^/?#&]+)" + pattern = r"(?:https?://)?pin\.it/([^/?#]+)" test = ( ("https://pin.it/Hvt8hgT", { @@ -370,7 +368,7 @@ class PinterestPinitExtractor(PinterestExtractor): self.shortened_id = match.group(1) def items(self): - url = "https://api.pinterest.com/url_shortener/{}/redirect".format( + url = "https://api.pinterest.com/url_shortener/{}/redirect/".format( self.shortened_id) response = self.request(url, method="HEAD", allow_redirects=False) location = response.headers.get("Location") @@ -458,10 +456,10 @@ class PinterestAPI(): options = {"section_id": section_id} return self._pagination("BoardSectionPins", options) - def board_related(self, board_id): + def board_content_recommendation(self, board_id): """Yield related pins of a specific board""" - 
options = {"board_id": board_id, "add_vase": True} - return self._pagination("BoardRelatedPixieFeed", options) + options = {"id": board_id, "type": "board", "add_vase": True} + return self._pagination("BoardContentRecommendation", options) def user_pins(self, user): """Yield all pins from 'user'""" diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index a17518fe..ffe8030f 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -15,6 +15,9 @@ from datetime import datetime, timedelta import itertools import hashlib +BASE_PATTERN = r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" +USER_PATTERN = BASE_PATTERN + r"/(?:en/)?users/(\d+)" + class PixivExtractor(Extractor): """Base class for pixiv extractors""" @@ -23,10 +26,9 @@ class PixivExtractor(Extractor): directory_fmt = ("{category}", "{user[id]} {user[account]}") filename_fmt = "{id}_p{num}.{extension}" archive_fmt = "{id}{suffix}.{extension}" - cookiedomain = None + cookies_domain = None - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.api = PixivAppAPI(self) self.load_ugoira = self.config("ugoira", True) self.max_posts = self.config("max-posts", 0) @@ -44,6 +46,8 @@ class PixivExtractor(Extractor): def transform_tags(work): work["tags"] = [tag["name"] for tag in work["tags"]] + url_sanity = ("https://s.pximg.net/common/images" + "/limit_sanity_level_360.png") ratings = {0: "General", 1: "R-18", 2: "R-18G"} meta_user = self.config("metadata") meta_bookmark = self.config("metadata-bookmark") @@ -99,6 +103,10 @@ class PixivExtractor(Extractor): elif work["page_count"] == 1: url = meta_single_page["original_image_url"] + if url == url_sanity: + self.log.debug("Skipping 'sanity_level' warning (%s)", + work["id"]) + continue work["date_url"] = self._date_from_url(url) yield Message.Url, url, text.nameext_from_url(url, work) @@ -150,7 +158,7 @@ class PixivExtractor(Extractor): class PixivUserExtractor(PixivExtractor): """Extractor for a pixiv user profile""" subcategory = "user" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" + pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?u(?:sers)?/|member\.php\?id=|(?:mypage\.php)?#id=" r")(\d+)(?:$|[?#])") test = ( @@ -165,20 +173,25 @@ class PixivUserExtractor(PixivExtractor): PixivExtractor.__init__(self, match) self.user_id = match.group(1) + def initialize(self): + pass + def items(self): base = "{}/users/{}/".format(self.root, self.user_id) return self._dispatch_extractors(( - (PixivAvatarExtractor , base + "avatar"), - (PixivBackgroundExtractor, base + "background"), - (PixivArtworksExtractor , base + "artworks"), - (PixivFavoriteExtractor , base + "bookmarks/artworks"), + (PixivAvatarExtractor , base + "avatar"), + (PixivBackgroundExtractor , base + "background"), + (PixivArtworksExtractor , base + "artworks"), + (PixivFavoriteExtractor , base + "bookmarks/artworks"), + (PixivNovelBookmarkExtractor, base + "bookmarks/novels"), + (PixivNovelUserExtractor , base + "novels"), ), ("artworks",)) class PixivArtworksExtractor(PixivExtractor): """Extractor for artworks of a pixiv user""" subcategory = "artworks" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:" + pattern = (BASE_PATTERN + r"/(?:" r"(?:en/)?users/(\d+)/(?:artworks|illustrations|manga)" r"(?:/([^/?#]+))?/?(?:$|[?#])" r"|member_illust\.php\?id=(\d+)(?:&([^#]+))?)") @@ -239,8 +252,7 @@ class PixivAvatarExtractor(PixivExtractor): subcategory = "avatar" filename_fmt = "avatar{date:?_//%Y-%m-%d}.{extension}" archive_fmt = 
"avatar_{user[id]}_{date}" - pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" - r"/(?:en/)?users/(\d+)/avatar") + pattern = USER_PATTERN + r"/avatar" test = ("https://www.pixiv.net/en/users/173530/avatar", { "content": "4e57544480cc2036ea9608103e8f024fa737fe66", }) @@ -260,8 +272,7 @@ class PixivBackgroundExtractor(PixivExtractor): subcategory = "background" filename_fmt = "background{date:?_//%Y-%m-%d}.{extension}" archive_fmt = "background_{user[id]}_{date}" - pattern = (r"(?:https?://)?(?:www\.)?pixiv\.net" - r"/(?:en/)?users/(\d+)/background") + pattern = USER_PATTERN + "/background" test = ("https://www.pixiv.net/en/users/194921/background", { "pattern": r"https://i\.pximg\.net/background/img/2021/01/30/16/12/02" r"/194921_af1f71e557a42f499213d4b9eaccc0f8\.jpg", @@ -375,12 +386,12 @@ class PixivWorkExtractor(PixivExtractor): class PixivFavoriteExtractor(PixivExtractor): - """Extractor for all favorites/bookmarks of a pixiv-user""" + """Extractor for all favorites/bookmarks of a pixiv user""" subcategory = "favorite" directory_fmt = ("{category}", "bookmarks", "{user_bookmark[id]} {user_bookmark[account]}") archive_fmt = "f_{user_bookmark[id]}_{id}{num}.{extension}" - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net/(?:(?:en/)?" + pattern = (BASE_PATTERN + r"/(?:(?:en/)?" r"users/(\d+)/(bookmarks/artworks|following)(?:/([^/?#]+))?" r"|bookmark\.php)(?:\?([^#]*))?") test = ( @@ -483,8 +494,7 @@ class PixivRankingExtractor(PixivExtractor): archive_fmt = "r_{ranking[mode]}_{ranking[date]}_{id}{num}.{extension}" directory_fmt = ("{category}", "rankings", "{ranking[mode]}", "{ranking[date]}") - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/ranking\.php(?:\?([^#]*))?") + pattern = BASE_PATTERN + r"/ranking\.php(?:\?([^#]*))?" test = ( ("https://www.pixiv.net/ranking.php?mode=daily&date=20170818"), ("https://www.pixiv.net/ranking.php"), @@ -549,8 +559,7 @@ class PixivSearchExtractor(PixivExtractor): subcategory = "search" archive_fmt = "s_{search[word]}_{id}{num}.{extension}" directory_fmt = ("{category}", "search", "{search[word]}") - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" + pattern = (BASE_PATTERN + r"/(?:(?:en/)?tags/([^/?#]+)(?:/[^/?#]+)?/?" 
r"|search\.php)(?:\?([^#]+))?") test = ( ("https://www.pixiv.net/en/tags/Original", { @@ -596,6 +605,9 @@ class PixivSearchExtractor(PixivExtractor): sort_map = { "date": "date_asc", "date_d": "date_desc", + "popular_d": "popular_desc", + "popular_male_d": "popular_male_desc", + "popular_female_d": "popular_female_desc", } try: self.sort = sort = sort_map[sort] @@ -630,8 +642,7 @@ class PixivFollowExtractor(PixivExtractor): subcategory = "follow" archive_fmt = "F_{user_follow[id]}_{id}{num}.{extension}" directory_fmt = ("{category}", "following") - pattern = (r"(?:https?://)?(?:www\.|touch\.)?pixiv\.net" - r"/bookmark_new_illust\.php") + pattern = BASE_PATTERN + r"/bookmark_new_illust\.php" test = ( ("https://www.pixiv.net/bookmark_new_illust.php"), ("https://touch.pixiv.net/bookmark_new_illust.php"), @@ -670,7 +681,7 @@ class PixivPixivisionExtractor(PixivExtractor): def works(self): return ( - self.api.illust_detail(illust_id) + self.api.illust_detail(illust_id.partition("?")[0]) for illust_id in util.unique_sequence(text.extract_iter( self.page, '', '")[2]), "description": text.unescape(extr( - 'class="IllustItemDesc" >', '<')), + 'class="IllustItemDesc" >', '')), "_http_headers": {"Referer": post_url}, } @@ -76,11 +76,12 @@ class PoipikuExtractor(Extractor): "MD" : "0", "TWF": "-1", } - page = self.request( - url, method="POST", headers=headers, data=data).json()["html"] + resp = self.request( + url, method="POST", headers=headers, data=data).json() - if page.startswith(("You need to", "Password is incorrect")): - self.log.warning("'%s'", page) + page = resp["html"] + if (resp.get("result_num") or 0) < 0: + self.log.warning("'%s'", page.replace("
", " ")) for thumb in text.extract_iter( page, 'class="IllustItemThumbImg" src="', '"'): @@ -172,7 +173,9 @@ class PoipikuPostExtractor(PoipikuExtractor): "count": 3, "keyword": { "count": "3", - "description": "ORANGE OASISボスネタバレ", + "description": "ORANGE OASISボスネタバレ
<br/>曲も大好き<br/>
" + "2枚目以降はほとんど見えなかった1枚目背景" + "のヒエログリフ小ネタです𓀀", "num": int, "post_category": "SPOILER", "post_id": "5776587", diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index f8497c09..d3619da6 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2019-2021 Mike Fährmann +# Copyright 2019-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -11,7 +11,6 @@ from .common import Extractor, Message from .. import text, exception - BASE_PATTERN = r"(?:https?://)?(?:[\w-]+\.)?pornhub\.com" @@ -59,6 +58,9 @@ class PornhubGalleryExtractor(PornhubExtractor): self._first = None def items(self): + self.cookies.set( + "accessAgeDisclaimerPH", "1", domain=".pornhub.com") + data = self.metadata() yield Message.Directory, data for num, image in enumerate(self.images(), 1): @@ -109,7 +111,7 @@ class PornhubGalleryExtractor(PornhubExtractor): "views" : text.parse_int(img["times_viewed"]), "score" : text.parse_int(img["vote_percent"]), } - key = img["next"] + key = str(img["next"]) if key == end: return @@ -146,10 +148,20 @@ class PornhubUserExtractor(PornhubExtractor): data = {"_extractor": PornhubGalleryExtractor} while True: - page = self.request( - url, method="POST", headers=headers, params=params).text - if not page: - return - for gid in text.extract_iter(page, 'id="albumphoto', '"'): + response = self.request( + url, method="POST", headers=headers, params=params, + allow_redirects=False) + + if 300 <= response.status_code < 400: + url = "{}{}/photos/{}/ajax".format( + self.root, response.headers["location"], + self.cat or "public") + continue + + gid = None + for gid in text.extract_iter(response.text, 'id="albumphoto', '"'): yield Message.Queue, self.root + "/album/" + gid, data + if gid is None: + return + params["page"] += 1 diff --git a/gallery_dl/extractor/pornpics.py b/gallery_dl/extractor/pornpics.py index 783f3da9..929e0f58 100644 --- a/gallery_dl/extractor/pornpics.py +++ b/gallery_dl/extractor/pornpics.py @@ -23,7 +23,9 @@ class PornpicsExtractor(Extractor): def __init__(self, match): super().__init__(match) self.item = match.group(1) - self.session.headers["Referer"] = self.root + + def _init(self): + self.session.headers["Referer"] = self.root + "/" def items(self): for gallery in self.galleries(): diff --git a/gallery_dl/extractor/reactor.py b/gallery_dl/extractor/reactor.py index 1800b68d..ba571bbd 100644 --- a/gallery_dl/extractor/reactor.py +++ b/gallery_dl/extractor/reactor.py @@ -22,18 +22,21 @@ class ReactorExtractor(BaseExtractor): def __init__(self, match): BaseExtractor.__init__(self, match) + url = text.ensure_http_scheme(match.group(0), "http://") pos = url.index("/", 10) - - self.root, self.path = url[:pos], url[pos:] - self.session.headers["Referer"] = self.root - self.gif = self.config("gif", False) + self.root = url[:pos] + self.path = url[pos:] if self.category == "reactor": # set category based on domain name netloc = urllib.parse.urlsplit(self.root).netloc self.category = netloc.rpartition(".")[0] + def _init(self): + self.session.headers["Referer"] = self.root + self.gif = self.config("gif", False) + def items(self): data = self.metadata() yield Message.Directory, data diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index c924e0a3..c68068cb 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ 
b/gallery_dl/extractor/readcomiconline.py @@ -57,8 +57,10 @@ class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor): def __init__(self, match): ChapterExtractor.__init__(self, match) + self.params = match.group(2) - params = text.parse_query(match.group(2)) + def _init(self): + params = text.parse_query(self.params) quality = self.config("quality") if quality is None or quality == "auto": diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2e2583db..05da7f4a 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -19,7 +19,8 @@ class RedditExtractor(Extractor): directory_fmt = ("{category}", "{subreddit}") filename_fmt = "{id}{num:? //>02} {title[:220]}.{extension}" archive_fmt = "{filename}" - cookiedomain = ".reddit.com" + cookies_domain = ".reddit.com" + request_interval = 0.6 def items(self): self.api = RedditAPI(self) @@ -55,21 +56,29 @@ class RedditExtractor(Extractor): visited.add(submission["id"]) submission["num"] = 0 - url = submission["url"] + if "crosspost_parent_list" in submission: + try: + media = submission["crosspost_parent_list"][-1] + except Exception: + media = submission + else: + media = submission + + url = media["url"] if url and url.startswith("https://i.redd.it/"): text.nameext_from_url(url, submission) yield Message.Url, url, submission - elif "gallery_data" in submission: + elif "gallery_data" in media: for submission["num"], url in enumerate( - self._extract_gallery(submission), 1): + self._extract_gallery(media), 1): text.nameext_from_url(url, submission) yield Message.Url, url, submission - elif submission["is_video"]: + elif media["is_video"]: if videos: text.nameext_from_url(url, submission) - url = "ytdl:" + self._extract_video(submission) + url = "ytdl:" + self._extract_video(media) yield Message.Url, url, submission elif not submission["is_self"]: @@ -280,14 +289,19 @@ class RedditSubmissionExtractor(RedditExtractor): ("https://www.reddit.com/r/kpopfap/comments/qjj04q/", { "count": 0, }), - ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), - ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), - ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), - ("https://redd.it/2a00np/"), + # user page submission (#2301) ("https://www.reddit.com/user/TheSpiritTree/comments/srilyf/", { "pattern": r"https://i.redd.it/8fpgv17yqlh81.jpg", "count": 1, }), + # cross-posted video (#887, #3586, #3976) + ("https://www.reddit.com/r/kittengifs/comments/12m0b8d", { + "pattern": r"ytdl:https://v\.redd\.it/cvabpjacrvta1", + }), + ("https://old.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://np.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://m.reddit.com/r/lavaporn/comments/2a00np/"), + ("https://redd.it/2a00np/"), ) def __init__(self, match): @@ -303,8 +317,8 @@ class RedditImageExtractor(Extractor): category = "reddit" subcategory = "image" archive_fmt = "{filename}" - pattern = (r"(?:https?://)?i\.redd(?:\.it|ituploads\.com)" - r"/[^/?#]+(?:\?[^#]*)?") + pattern = (r"(?:https?://)?((?:i|preview)\.redd\.it|i\.reddituploads\.com)" + r"/([^/?#]+)(\?[^#]*)?") test = ( ("https://i.redd.it/upjtjcx2npzz.jpg", { "url": "0de614900feef103e580b632190458c0b62b641a", @@ -315,12 +329,29 @@ class RedditImageExtractor(Extractor): "url": "f24f25efcedaddeec802e46c60d77ef975dc52a5", "content": "541dbcc3ad77aa01ee21ca49843c5e382371fae7", }), + # preview.redd.it -> i.redd.it + (("https://preview.redd.it/00af44lpn0u51.jpg?width=960&crop=smart" + 
"&auto=webp&v=enabled&s=dbca8ab84033f4a433772d9c15dbe0429c74e8ac"), { + "pattern": r"^https://i\.redd\.it/00af44lpn0u51\.jpg$" + }), ) + def __init__(self, match): + Extractor.__init__(self, match) + domain = match.group(1) + self.path = match.group(2) + if domain == "preview.redd.it": + self.domain = "i.redd.it" + self.query = "" + else: + self.domain = domain + self.query = match.group(3) or "" + def items(self): - data = text.nameext_from_url(self.url) + url = "https://{}/{}{}".format(self.domain, self.path, self.query) + data = text.nameext_from_url(url) yield Message.Directory, data - yield Message.Url, self.url, data + yield Message.Url, url, data class RedditAPI(): @@ -347,6 +378,18 @@ class RedditAPI(): self.client_id = client_id self.headers = {"User-Agent": config("user-agent")} + if self.client_id == self.CLIENT_ID: + client_id = self.client_id + self._warn_429 = True + kind = "default" + else: + client_id = client_id[:5] + "*" * (len(client_id)-5) + self._warn_429 = False + kind = "custom" + + self.log.debug( + "Using %s API credentials (client-id %s)", kind, client_id) + token = config("refresh-token") if token is None or token == "cache": key = "#" + self.client_id @@ -356,9 +399,9 @@ class RedditAPI(): if not self.refresh_token: # allow downloading from quarantined subreddits (#2180) - extractor._cookiejar.set( + extractor.cookies.set( "_options", '%7B%22pref_quarantine_optin%22%3A%20true%7D', - domain=extractor.cookiedomain) + domain=extractor.cookies_domain) def submission(self, submission_id): """Fetch the (submission, comments)=-tuple for a submission id""" @@ -433,28 +476,39 @@ class RedditAPI(): def _call(self, endpoint, params): url = "https://oauth.reddit.com" + endpoint params["raw_json"] = "1" - self.authenticate() - response = self.extractor.request( - url, params=params, headers=self.headers, fatal=None) - remaining = response.headers.get("x-ratelimit-remaining") - if remaining and float(remaining) < 2: - self.extractor.wait(seconds=response.headers["x-ratelimit-reset"]) - return self._call(endpoint, params) + while True: + self.authenticate() + response = self.extractor.request( + url, params=params, headers=self.headers, fatal=None) - try: - data = response.json() - except ValueError: - raise exception.StopExtraction(text.remove_html(response.text)) + remaining = response.headers.get("x-ratelimit-remaining") + if remaining and float(remaining) < 2: + if self._warn_429: + self._warn_429 = False + self.log.info( + "Register your own OAuth application and use its " + "credentials to prevent this error: " + "https://github.com/mikf/gallery-dl/blob/master" + "/docs/configuration.rst" + "#extractorredditclient-id--user-agent") + self.extractor.wait( + seconds=response.headers["x-ratelimit-reset"]) + continue - if "error" in data: - if data["error"] == 403: - raise exception.AuthorizationError() - if data["error"] == 404: - raise exception.NotFoundError() - self.log.debug(data) - raise exception.StopExtraction(data.get("message")) - return data + try: + data = response.json() + except ValueError: + raise exception.StopExtraction(text.remove_html(response.text)) + + if "error" in data: + if data["error"] == 403: + raise exception.AuthorizationError() + if data["error"] == 404: + raise exception.NotFoundError() + self.log.debug(data) + raise exception.StopExtraction(data.get("message")) + return data def _pagination(self, endpoint, params): id_min = self._parse_id("id-min", 0) diff --git a/gallery_dl/extractor/redgifs.py b/gallery_dl/extractor/redgifs.py index 
eaaef7d8..abd21b30 100644 --- a/gallery_dl/extractor/redgifs.py +++ b/gallery_dl/extractor/redgifs.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2020-2022 Mike Fährmann +# Copyright 2020-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -16,13 +16,16 @@ from ..cache import memcache class RedgifsExtractor(Extractor): """Base class for redgifs extractors""" category = "redgifs" - filename_fmt = "{category}_{id}.{extension}" + filename_fmt = \ + "{category}_{gallery:?//[:11]}{num:?_/_/>02}{id}.{extension}" archive_fmt = "{id}" root = "https://www.redgifs.com" def __init__(self, match): Extractor.__init__(self, match) self.key = match.group(1) + + def _init(self): self.api = RedgifsAPI(self) formats = self.config("format") @@ -34,16 +37,32 @@ class RedgifsExtractor(Extractor): def items(self): metadata = self.metadata() + for gif in self.gifs(): - url = self._process(gif) - if not url: - self.log.warning("Skipping '%s' (format not available)", - gif["id"]) - continue + + gallery = gif.get("gallery") + if gallery: + gifs = self.api.gallery(gallery)["gifs"] + enum = 1 + cnt = len(gifs) + else: + gifs = (gif,) + enum = 0 + cnt = 1 gif.update(metadata) + gif["count"] = cnt yield Message.Directory, gif - yield Message.Url, url, gif + + for num, gif in enumerate(gifs, enum): + url = self._process(gif) + if not url: + self.log.warning( + "Skipping '%s' (format not available)", gif["id"]) + continue + gif["num"] = num + gif["count"] = cnt + yield Message.Url, url, gif def _process(self, gif): gif["_fallback"] = formats = self._formats(gif) @@ -141,25 +160,61 @@ class RedgifsCollectionsExtractor(RedgifsExtractor): yield Message.Queue, url, collection +class RedgifsNichesExtractor(RedgifsExtractor): + """Extractor for redgifs niches""" + subcategory = "niches" + pattern = r"(?:https?://)?(?:www\.)?redgifs\.com/niches/([^/?#]+)" + test = ( + ("https://www.redgifs.com/niches/boobs", { + "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + "range": "1-20", + "count": 20, + }), + ("https://www.redgifs.com/niches/ass", { + "pattern": r"https://\w+\.redgifs\.com/[\w-]+\.mp4", + "range": "1-20", + "count": 20, + }), + ) + + def gifs(self): + return self.api.niches(self.key) + + class RedgifsSearchExtractor(RedgifsExtractor): """Extractor for redgifs search results""" subcategory = "search" directory_fmt = ("{category}", "Search", "{search}") - pattern = r"(?:https?://)?(?:\w+\.)?redgifs\.com/browse/?\?([^#]+)" + pattern = (r"(?:https?://)?(?:\w+\.)?redgifs\.com" + r"/(?:gifs/([^/?#]+)|browse)(?:/?\?([^#]+))?") test = ( + ("https://www.redgifs.com/gifs/jav", { + "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)", + "range": "1-10", + "count": 10, + }), ("https://www.redgifs.com/browse?tags=JAV", { "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.(mp4|jpg)", "range": "1-10", "count": 10, }), - ("https://v3.redgifs.com/browse?tags=JAV"), + ("https://www.redgifs.com/gifs/jav?order=best&verified=1"), ("https://www.redgifs.com/browse?type=i&verified=y&order=top7"), + ("https://v3.redgifs.com/browse?tags=JAV"), ) + def __init__(self, match): + RedgifsExtractor.__init__(self, match) + self.search, self.query = match.groups() + def metadata(self): - self.params = params = text.parse_query(self.key) - search = params.get("tags") or params.get("order") or "trending" - return {"search": search} + self.params = text.parse_query(self.query) + if self.search: + self.params["tags"] = 
text.unquote(self.search) + + return {"search": (self.params.get("tags") or + self.params.get("order") or + "trending")} def gifs(self): return self.api.search(self.params) @@ -178,6 +233,16 @@ class RedgifsImageExtractor(RedgifsExtractor): r"/FoolishForkedAbyssiniancat\.mp4", "content": "f6e03f1df9a2ff2a74092f53ee7580d2fb943533", }), + # gallery (#4021) + ("https://www.redgifs.com/watch/desertedbaregraywolf", { + "pattern": r"https://\w+\.redgifs\.com/[A-Za-z-]+\.jpg", + "count": 4, + "keyword": { + "num": int, + "count": 4, + "gallery": "187ad979693-1922-fc66-0000-a96fb07b8a5d", + }, + }), ("https://redgifs.com/ifr/FoolishForkedAbyssiniancat"), ("https://i.redgifs.com/i/FoolishForkedAbyssiniancat"), ("https://www.gifdeliverynetwork.com/foolishforkedabyssiniancat"), @@ -207,6 +272,10 @@ class RedgifsAPI(): endpoint = "/v2/gifs/" + gif_id.lower() return self._call(endpoint)["gif"] + def gallery(self, gallery_id): + endpoint = "/v2/gallery/" + gallery_id + return self._call(endpoint) + def user(self, user, order="best"): endpoint = "/v2/users/{}/search".format(user.lower()) params = {"order": order} @@ -225,10 +294,13 @@ class RedgifsAPI(): endpoint = "/v2/users/{}/collections".format(user) return self._pagination(endpoint, key="collections") + def niches(self, niche): + endpoint = "/v2/niches/{}/gifs".format(niche) + return self._pagination(endpoint) + def search(self, params): endpoint = "/v2/gifs/search" params["search_text"] = params.pop("tags", None) - params.pop("needSendGtm", None) return self._pagination(endpoint, params) def _call(self, endpoint, params=None): diff --git a/gallery_dl/extractor/rule34us.py b/gallery_dl/extractor/rule34us.py index 00b6972d..88331eaa 100644 --- a/gallery_dl/extractor/rule34us.py +++ b/gallery_dl/extractor/rule34us.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2021 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -19,8 +19,7 @@ class Rule34usExtractor(BooruExtractor): root = "https://rule34.us" per_page = 42 - def __init__(self, match): - BooruExtractor.__init__(self, match) + def _init(self): self._find_tags = re.compile( r'
  • ]*>', '') - count, pos = text.extract(page, ' of ', '\n', pos) + def metadata(self, page): + title = text.extr(page, "", "") manga, _, chapter = title.partition(" - Chapter ") return { - "manga": text.unescape(manga).replace("-", " "), - "chapter_string": chapter.partition(" - Page ")[0], - "count": text.parse_int(count), - "lang": "jp", - "language": "Japanese", + "manga" : text.unescape(manga).replace("-", " "), + "chapter" : chapter.partition(" - Page ")[0], + "chapter_minor": "", + "lang" : "ja", + "language" : "Japanese", } + + def images(self, page): + return [ + (text.ensure_http_scheme(url), None) + for url in text.extract_iter( + page, '= 30" + }), + ("https://loudbooru.com/post/list/original_character/1", { + "pattern": r"https://loudbooru\.com/_images/[0-9a-f]{32}/\d+", + "range": "1-100", + "count": 100, + }), + ("https://giantessbooru.com/post/list/smiling/1", { + "pattern": r"https://giantessbooru\.com/_images/[0-9a-f]{32}/\d+", + "range": "1-100", + "count": 100, + }), + ("https://tentaclerape.net/post/list/comic/1", { + "pattern": r"https://tentaclerape\.net/_images/[0-9a-f]{32}/\d+", + "range": "1-100", + "count": 100, + }), + ("https://booru.cavemanon.xyz/index.php?q=post/list/Amber/1", { + "pattern": r"https://booru\.cavemanon\.xyz" + r"/index\.php\?q=image/\d+\.\w+", + "range": "1-100", + "count": 100, + }), + ) + + def __init__(self, match): + Shimmie2Extractor.__init__(self, match) + lastindex = match.lastindex + self.tags = text.unquote(match.group(lastindex-2)) + self.page = match.group(lastindex-1) + + def metadata(self): + return {"search_tags": self.tags} + + def posts(self): + pnum = text.parse_int(self.page, 1) + file_url_fmt = self.file_url_fmt.format + + init = True + mime = "" + + while True: + url = "{}/post/list/{}/{}".format(self.root, self.tags, pnum) + page = self.request(url).text + extr = text.extract_from(page) + + if init: + init = False + has_mime = ("data-mime='" in page) + has_pid = ("data-post-id='" in page) + + while True: + if has_mime: + mime = extr("data-mime='", "'") + if has_pid: + pid = extr("data-post-id='", "'") + else: + pid = extr("href='/post/view/", "?") + + if not pid: + break + + tags, dimensions, size = extr("title='", "'").split(" // ") + width, _, height = dimensions.partition("x") + md5 = extr("/_thumbs/", "/") + + yield { + "file_url": file_url_fmt( + self.root, md5, pid, text.quote(tags), + mime.rpartition("/")[2] if mime else "jpg"), + "id": pid, + "md5": md5, + "tags": tags, + "width": width, + "height": height, + "size": text.parse_bytes(size[:-1]), + } + + pnum += 1 + if not extr(">Next<", ">"): + if not extr("/{}'>{}<".format(pnum, pnum), ">"): + return + + +class Shimmie2PostExtractor(Shimmie2Extractor): + """Extractor for single shimmie2 posts""" + subcategory = "post" + pattern = BASE_PATTERN + r"post/view/(\d+)" + test = ( + ("https://meme.museum/post/view/10243", { + "pattern": r"https://meme\.museum/_images/105febebcd5ca791ee332adc" + r"49971f78/10243%20-%20g%20beard%20open_source%20richar" + r"d_stallman%20stallman%20tagme%20text\.jpg", + "content": "45565f3f141fc960a8ae1168b80e718a494c52d2", + "keyword": { + "extension": "jpg", + "file_url": "https://meme.museum/_images/105febebcd5ca791ee332" + "adc49971f78/10243%20-%20g%20beard%20open_source%2" + "0richard_stallman%20stallman%20tagme%20text.jpg", + "filename": "10243 - g beard open_source richard_stallman " + "stallman tagme text", + "height": 451, + "id": 10243, + "md5": "105febebcd5ca791ee332adc49971f78", + "size": 0, + "subcategory": "post", + "tags": 
"/g/ beard open_source " + "richard_stallman stallman tagme text", + "width": 480, + }, + }), + ("https://loudbooru.com/post/view/33828", { + "pattern": r"https://loudbooru\.com/_images/.+\.png", + "content": "a4755f787ba23ae2aa297a46810f802ca9032739", + "keyword": { + "extension": "png", + "file_url": "https://loudbooru.com/_images/ca2638d903c86e8337f" + "e9aeb4974be88/33828%20-%202020%20artist%3Astikyfi" + "nkaz%20character%3Alisa_loud%20cover%20fanfiction" + "%3Aplatz_eins%20frowning%20half-closed_eyes%20sol" + "o%20text%20title_card.png", + "filename": "33828 - 2020 artist:stikyfinkaz character:lisa_" + "loud cover fanfiction:platz_eins frowning " + "half-closed_eyes solo text title_card", + "height": 1920, + "id": 33828, + "md5": "ca2638d903c86e8337fe9aeb4974be88", + "tags": "2020 artist:stikyfinkaz character:lisa_loud cover " + "fanfiction:platz_eins frowning half-closed_eyes " + "solo text title_card", + "width": 1078, + }, + }), + ("https://giantessbooru.com/post/view/41", { + "pattern": r"https://giantessbooru\.com/_images" + r"/3f67e1986496806b7b14ff3e82ac5af4/41\.jpg", + "content": "79115ed309d1f4e82e7bead6948760e889139c91", + "keyword": { + "extension": "jpg", + "file_url": "https://giantessbooru.com/_images" + "/3f67e1986496806b7b14ff3e82ac5af4/41.jpg", + "filename": "41", + "height": 0, + "id": 41, + "md5": "3f67e1986496806b7b14ff3e82ac5af4", + "size": 0, + "tags": "anime bare_midriff color drawing gentle giantess " + "karbo looking_at_tinies negeyari outdoors smiling " + "snake_girl white_hair", + "width": 0 + + + }, + }), + ("https://tentaclerape.net/post/view/10", { + "pattern": r"https://tentaclerape\.net/\./index\.php" + r"\?q=/image/10\.jpg", + "content": "d0fd8f0f6517a76cb5e23ba09f3844950bf2c516", + "keyword": { + "extension": "jpg", + "file_url": "https://tentaclerape.net/./index.php" + "?q=/image/10.jpg", + "filename": "10", + "height": 427, + "id": 10, + "md5": "945db71eeccaef82ce44b77564260c0b", + "size": 0, + "subcategory": "post", + "tags": "Deviant_Art Pet Tentacle artist_sche blonde_hair " + "blouse boots green_eyes highheels leash miniskirt " + "octopus schoolgirl white_skin willing", + "width": 300, + }, + }), + # video + ("https://tentaclerape.net/post/view/91267", { + "pattern": r"https://tentaclerape\.net/\./index\.php" + r"\?q=/image/91267\.mp4", + }), + ("https://booru.cavemanon.xyz/index.php?q=post/view/8335", { + "pattern": r"https://booru\.cavemanon\.xyz" + r"/index\.php\?q=image/8335\.png", + "content": "7158f7e4abbbf143bad5835eb93dbe4d68c1d4ab", + "keyword": { + "extension": "png", + "file_url": "https://booru.cavemanon.xyz" + "/index.php?q=image/8335.png", + "filename": "8335", + "height": 460, + "id": 8335, + "md5": "", + "size": 0, + "tags": "Color Fang", + "width": 459, + }, + }), + ) + + def __init__(self, match): + Shimmie2Extractor.__init__(self, match) + self.post_id = match.group(match.lastindex) + + def posts(self): + url = "{}/post/view/{}".format(self.root, self.post_id) + extr = text.extract_from(self.request(url).text) + + post = { + "id" : self.post_id, + "tags" : extr(": ", "<").partition(" - ")[0].rstrip(")"), + "md5" : extr("/_thumbs/", "/"), + "file_url": self.root + ( + extr("id='main_image' src='", "'") or + extr("").partition( + " ")[0].strip("\"'"), + "size" : 0, + } + + if not post["md5"]: + post["md5"] = text.extr(post["file_url"], "/_images/", "/") + + return (post,) diff --git a/gallery_dl/extractor/simplyhentai.py b/gallery_dl/extractor/simplyhentai.py index b5d116fd..d1ccc492 100644 --- 
a/gallery_dl/extractor/simplyhentai.py +++ b/gallery_dl/extractor/simplyhentai.py @@ -40,7 +40,9 @@ class SimplyhentaiGalleryExtractor(GalleryExtractor): path = "/" + subdomain.rstrip(".") + path url = "https://old.simply-hentai.com" + path GalleryExtractor.__init__(self, match, url) - self.session.headers["Referer"] = url + + def _init(self): + self.session.headers["Referer"] = self.gallery_url def metadata(self, page): extr = text.extract_from(page) diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index 3724c859..b643c6f2 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -22,6 +22,8 @@ class SkebExtractor(Extractor): def __init__(self, match): Extractor.__init__(self, match) self.user_name = match.group(1) + + def _init(self): self.thumbnails = self.config("thumbnails", False) self.article = self.config("article", False) diff --git a/gallery_dl/extractor/slideshare.py b/gallery_dl/extractor/slideshare.py index bea457f4..3521298f 100644 --- a/gallery_dl/extractor/slideshare.py +++ b/gallery_dl/extractor/slideshare.py @@ -30,21 +30,20 @@ class SlidesharePresentationExtractor(GalleryExtractor): "count": 19, "content": "2b6a191eab60b3978fdacfecf2da302dd45bc108", "keyword": { - "comments": "0", "description": "Get Started with SlideShare - " "A Beginngers Guide for Creators", - "likes": r"re:\d{3,}", + "likes": int, "presentation": "get-started-with-slide-share", - "published": "dt:2015-05-20 00:00:00", + "date": "dt:2015-05-20 17:38:21", "title": "Getting Started With SlideShare", "user": "Slideshare", - "views": r"re:\d{7,}", + "views": int, }, }), # long title and description (("https://www.slideshare.net/pragmaticsolutions/warum-sie-nicht-ihren" "-mitarbeitenden-ndern-sollten-sondern-ihr-managementsystem"), { - "url": "cf70ca99f57f61affab47ebf8583eb564b21e3a7", + "url": "d8952260f8bec337dd809a958ec8091350393f6b", "keyword": { "title": "Warum Sie nicht Ihren Mitarbeitenden ändern " "sollten, sondern Ihr Managementsystem", @@ -58,7 +57,7 @@ class SlidesharePresentationExtractor(GalleryExtractor): # mobile URL (("https://www.slideshare.net" "/mobile/uqudent/introduction-to-fixed-prosthodontics"), { - "url": "43eda2adf4dd221a251c8df794dfb82649e94647", + "url": "72c431cb1eccbb6794f608ecbbc01d52e8768159", }), ) @@ -69,43 +68,31 @@ class SlidesharePresentationExtractor(GalleryExtractor): GalleryExtractor.__init__(self, match, url) def metadata(self, page): - extr = text.extract_from(page) - descr = extr('', '') - published = extr('') - - if descr.endswith("…"): - alt_descr = extr('slideshow-description-text"', '

    ') - if alt_descr: - descr = text.remove_html(alt_descr.partition(">")[2]).strip() + data = util.json_loads(text.extr( + page, 'id="__NEXT_DATA__" type="application/json">', '')) + self.slideshow = slideshow = data["props"]["pageProps"]["slideshow"] return { - "user": self.user, + "user" : slideshow["username"], "presentation": self.presentation, - "title": text.unescape(title.strip()), - "description": text.unescape(descr), - "views": views, - "likes": likes, - "comments": comments, - "published": text.parse_datetime( - published.strip(), "%b. %d, %Y"), + "title" : slideshow["title"].strip(), + "description" : slideshow["description"].strip(), + "views" : slideshow["views"], + "likes" : slideshow["likes"], + "date" : text.parse_datetime( + slideshow["createdAt"], "%Y-%m-%d %H:%M:%S %Z"), } - @staticmethod - def images(page): - data = util.json_loads(text.extract( - page, "xtend(true, slideshare_object.slideshow_config, ", ");")[0]) + def images(self, page): + parts = self.slideshow["slideImages"][0]["baseUrl"].split("/") - # useing 'stripped_title' here is technically wrong, but it works all - # the same, slideshare doesn't seem to care what characters go there - begin = "https://image.slidesharecdn.com/{}/95/{}-".format( - data["ppt_location"], data["stripped_title"]) - end = "-1024.jpg?cb=" + str(data["timestamp"]) + begin = "{}/95/{}-".format( + "/".join(parts[:4]), + self.slideshow["strippedTitle"], + ) + end = "-1024.jpg?" + parts[-1].rpartition("?")[2] return [ (begin + str(n) + end, None) - for n in range(1, data["slide_count"]+1) + for n in range(1, self.slideshow["totalSlides"]+1) ] diff --git a/gallery_dl/extractor/smugmug.py b/gallery_dl/extractor/smugmug.py index 713d4c41..b9edd4ab 100644 --- a/gallery_dl/extractor/smugmug.py +++ b/gallery_dl/extractor/smugmug.py @@ -21,7 +21,7 @@ class SmugmugExtractor(Extractor): category = "smugmug" filename_fmt = ("{category}_{User[NickName]:?/_/}" "{Image[UploadKey]}_{Image[ImageKey]}.{extension}") - cookiedomain = None + cookies_domain = None empty_user = { "Uri": "", "ResponseLevel": "Public", @@ -34,8 +34,7 @@ class SmugmugExtractor(Extractor): "Uris": None, } - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): self.api = SmugmugAPI(self) self.videos = self.config("videos", True) self.session = self.api.session diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 4de7e9b5..a2e1388c 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -22,14 +22,14 @@ class SubscribestarExtractor(Extractor): directory_fmt = ("{category}", "{author_name}") filename_fmt = "{post_id}_{id}.{extension}" archive_fmt = "{id}" - cookiedomain = "www.subscribestar.com" - cookienames = ("auth_token",) + cookies_domain = "www.subscribestar.com" + cookies_names = ("auth_token",) def __init__(self, match): tld, self.item = match.groups() if tld == "adult": self.root = "https://subscribestar.adult" - self.cookiedomain = "subscribestar.adult" + self.cookies_domain = "subscribestar.adult" self.subcategory += "-adult" Extractor.__init__(self, match) @@ -49,12 +49,12 @@ class SubscribestarExtractor(Extractor): """Yield HTML content of all relevant posts""" def login(self): - if self._check_cookies(self.cookienames): + if self.cookies_check(self.cookies_names): return + username, password = self._get_auth_info() if username: - cookies = self._login_impl(username, password) - self._update_cookies(cookies) + self.cookies_update(self._login_impl(username, 
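Editor's note: the slideshare rewrite above stops scraping individual HTML fragments and instead reads the page's embedded `__NEXT_DATA__` JSON, where Next.js sites keep their page state. A bare-bones version of that extraction; the `props`/`pageProps`/`slideshow` keys follow the diff, while the closing `</script>` delimiter and the regex approach are assumptions:

```python
import json
import re

def slideshow_from_page(html):
    """Return the 'slideshow' object embedded in a Next.js page (sketch)."""
    match = re.search(
        r'id="__NEXT_DATA__" type="application/json">(.+?)</script>',
        html, re.S)
    data = json.loads(match.group(1))
    return data["props"]["pageProps"]["slideshow"]

# e.g. slideshow["title"], slideshow["totalSlides"], slideshow["createdAt"]
```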
password)) @cache(maxage=28*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/szurubooru.py b/gallery_dl/extractor/szurubooru.py index 4b15b144..8c816ad1 100644 --- a/gallery_dl/extractor/szurubooru.py +++ b/gallery_dl/extractor/szurubooru.py @@ -20,8 +20,7 @@ class SzurubooruExtractor(booru.BooruExtractor): filename_fmt = "{id}_{version}_{checksumMD5}.{extension}" per_page = 100 - def __init__(self, match): - booru.BooruExtractor.__init__(self, match) + def _init(self): self.headers = { "Accept": "application/json", "Content-Type": "application/json", diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index 545a95bb..0e09e22a 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -22,12 +22,11 @@ class TapasExtractor(Extractor): directory_fmt = ("{category}", "{series[title]}", "{id} {title}") filename_fmt = "{num:>02}.{extension}" archive_fmt = "{id}_{num}" - cookiedomain = ".tapas.io" - cookienames = ("_cpc_",) + cookies_domain = ".tapas.io" + cookies_names = ("_cpc_",) _cache = None - def __init__(self, match): - Extractor.__init__(self, match) + def _init(self): if self._cache is None: TapasExtractor._cache = {} @@ -70,14 +69,17 @@ class TapasExtractor(Extractor): yield Message.Url, url, text.nameext_from_url(url, episode) def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(self._login_impl(username, password)) - else: - sc = self.session.cookies.set - sc("birthDate" , "1981-02-03", domain=self.cookiedomain) - sc("adjustedBirthDate", "1981-02-03", domain=self.cookiedomain) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + return self.cookies_update(self._login_impl(username, password)) + + self.cookies.set( + "birthDate" , "1981-02-03", domain=self.cookies_domain) + self.cookies.set( + "adjustedBirthDate", "1981-02-03", domain=self.cookies_domain) @cache(maxage=14*24*3600, keyarg=1) def _login_impl(self, username, password): diff --git a/gallery_dl/extractor/tcbscans.py b/gallery_dl/extractor/tcbscans.py index cac5a545..b5a730a4 100644 --- a/gallery_dl/extractor/tcbscans.py +++ b/gallery_dl/extractor/tcbscans.py @@ -4,19 +4,20 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. -"""Extractors for https://onepiecechapters.com/""" +"""Extractors for https://tcbscans.com/""" from .common import ChapterExtractor, MangaExtractor from .. 
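Editor's note: the reworked `login()` methods in this patch (subscribestar, tapas, and later twitter) share one shape — return early when the auth cookies are already present, otherwise log in with stored credentials, otherwise fall back to anonymous defaults such as age-gate cookies. Roughly, with a plain `requests` session and placeholder callables standing in for the extractor:

```python
import requests

def login(session, auth_cookies, credentials, do_login, domain):
    """Generic three-step cookie login flow (sketch)."""
    # 1. reuse the existing session if all auth cookies are present
    if all(name in session.cookies for name in auth_cookies):
        return

    # 2. perform an actual login when credentials are configured
    username, password = credentials
    if username:
        for name, value in do_login(username, password).items():
            session.cookies.set(name, value, domain=domain)
        return

    # 3. anonymous fallback, e.g. an age-gate cookie
    session.cookies.set("birthDate", "1981-02-03", domain=domain)
```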
import text +BASE_PATTERN = r"(?:https?://)?(?:tcbscans|onepiecechapters)\.com" + class TcbscansChapterExtractor(ChapterExtractor): category = "tcbscans" - pattern = (r"(?:https?://)?onepiecechapters\.com" - r"(/chapters/\d+/[^/?#]+)") - root = "https://onepiecechapters.com" + root = "https://tcbscans.com" + pattern = BASE_PATTERN + r"(/chapters/\d+/[^/?#]+)" test = ( - (("https://onepiecechapters.com" + (("https://tcbscans.com" "/chapters/4708/chainsaw-man-chapter-108"), { "pattern": (r"https://cdn\.[^/]+" r"/(file|attachments/[^/]+)/[^/]+/[^.]+\.\w+"), @@ -66,12 +67,11 @@ class TcbscansChapterExtractor(ChapterExtractor): class TcbscansMangaExtractor(MangaExtractor): category = "tcbscans" + root = "https://tcbscans.com" chapterclass = TcbscansChapterExtractor - pattern = (r"(?:https?://)?onepiecechapters\.com" - r"(/mangas/\d+/[^/?#]+)") - root = "https://onepiecechapters.com" + pattern = BASE_PATTERN + r"(/mangas/\d+/[^/?#]+)" test = ( - ("https://onepiecechapters.com/mangas/13/chainsaw-man", { + ("https://tcbscans.com/mangas/13/chainsaw-man", { "pattern": TcbscansChapterExtractor.pattern, "range" : "1-50", "count" : 50, diff --git a/gallery_dl/extractor/tsumino.py b/gallery_dl/extractor/tsumino.py index 92bd6347..e7d5226a 100644 --- a/gallery_dl/extractor/tsumino.py +++ b/gallery_dl/extractor/tsumino.py @@ -16,15 +16,15 @@ from ..cache import cache class TsuminoBase(): """Base class for tsumino extractors""" category = "tsumino" - cookiedomain = "www.tsumino.com" + cookies_domain = "www.tsumino.com" root = "https://www.tsumino.com" def login(self): username, password = self._get_auth_info() if username: - self._update_cookies(self._login_impl(username, password)) + self.cookies_update(self._login_impl(username, password)) else: - self.session.cookies.setdefault( + self.cookies.setdefault( "ASP.NET_SessionId", "x1drgggilez4cpkttneukrc5") @cache(maxage=14*24*3600, keyarg=1) @@ -37,7 +37,7 @@ class TsuminoBase(): response = self.request(url, method="POST", headers=headers, data=data) if not response.history: raise exception.AuthenticationError() - return self.session.cookies + return self.cookies class TsuminoGalleryExtractor(TsuminoBase, GalleryExtractor): diff --git a/gallery_dl/extractor/tumblr.py b/gallery_dl/extractor/tumblr.py index 155db1e5..9adc3ab1 100644 --- a/gallery_dl/extractor/tumblr.py +++ b/gallery_dl/extractor/tumblr.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2016-2022 Mike Fährmann +# Copyright 2016-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -10,7 +10,7 @@ from .common import Extractor, Message from .. 
import text, oauth, exception -from datetime import datetime, timedelta +from datetime import datetime, date, timedelta import re @@ -31,7 +31,7 @@ class TumblrExtractor(Extractor): directory_fmt = ("{category}", "{blog_name}") filename_fmt = "{category}_{blog_name}_{id}_{num:>02}.{extension}" archive_fmt = "{id}_{num}" - cookiedomain = None + cookies_domain = None def __init__(self, match): Extractor.__init__(self, match) @@ -42,6 +42,7 @@ class TumblrExtractor(Extractor): else: self.blog = match.group(1) or match.group(3) + def _init(self): self.api = TumblrAPI(self) self.types = self._setup_posttypes() self.avatar = self.config("avatar", False) @@ -269,7 +270,7 @@ class TumblrExtractor(Extractor): class TumblrUserExtractor(TumblrExtractor): - """Extractor for all images from a tumblr-user""" + """Extractor for a Tumblr user's posts""" subcategory = "user" pattern = BASE_PATTERN + r"(?:/page/\d+|/archive)?/?$" test = ( @@ -307,6 +308,16 @@ class TumblrUserExtractor(TumblrExtractor): "options": (("date-min", "201804"), ("date-max", "201805"), ("date-format", "%Y%m")) }), + # pagination with 'date-max' (#2191) and 'api-key' + ("https://donttrustthetits.tumblr.com/", { + "options": ( + ("access-token", None), + ("original", False), + ("date-max", "2015-04-25T00:00:00"), + ("date-min", "2015-04-01T00:00:00"), + ), + "count": 316, + }), ("https://demo.tumblr.com/page/2"), ("https://demo.tumblr.com/archive"), ("tumblr:http://www.b-authentique.com/"), @@ -321,7 +332,7 @@ class TumblrUserExtractor(TumblrExtractor): class TumblrPostExtractor(TumblrExtractor): - """Extractor for images from a single post on tumblr""" + """Extractor for a single Tumblr post""" subcategory = "post" pattern = BASE_PATTERN + r"/(?:post/|image/)?(\d+)" test = ( @@ -389,7 +400,7 @@ class TumblrPostExtractor(TumblrExtractor): class TumblrTagExtractor(TumblrExtractor): - """Extractor for images from a tumblr-user by tag""" + """Extractor for Tumblr user's posts by tag""" subcategory = "tag" pattern = BASE_PATTERN + r"/tagged/([^/?#]+)" test = ( @@ -411,8 +422,40 @@ class TumblrTagExtractor(TumblrExtractor): return self.api.posts(self.blog, {"tag": self.tag}) +class TumblrDayExtractor(TumblrExtractor): + """Extractor for Tumblr user's posts by day""" + subcategory = "day" + pattern = BASE_PATTERN + r"/day/(\d\d\d\d/\d\d/\d\d)" + test = ( + ("https://mikf123.tumblr.com/day/2018/01/05", { + "pattern": r"https://64\.media\.tumblr\.com" + r"/1a2be8c63f1df58abd2622861696c72a" + r"/tumblr_ozm9nqst9t1wgha4yo1_1280\.jpg", + "keyword": {"id": 169341068404}, + "count": 1, + }), + ("https://www.tumblr.com/blog/view/mikf123/day/2018/01/05"), + ("https://www.tumblr.com/blog/mikf123/day/2018/01/05"), + ("https://www.tumblr.com/mikf123/day/2018/01/05"), + ) + + def __init__(self, match): + TumblrExtractor.__init__(self, match) + year, month, day = match.group(4).split("/") + self.date_min = ( + # 719163 == date(1970, 1, 1).toordinal() + date(int(year), int(month), int(day)).toordinal() - 719163) * 86400 + + def _init(self): + TumblrExtractor._init(self) + self.api.before = self.date_min + 86400 + + def posts(self): + return self.api.posts(self.blog, {}) + + class TumblrLikesExtractor(TumblrExtractor): - """Extractor for images from a tumblr-user's liked posts""" + """Extractor for a Tumblr user's liked posts""" subcategory = "likes" directory_fmt = ("{category}", "{blog_name}", "likes") archive_fmt = "f_{blog[name]}_{id}_{num}" @@ -431,7 +474,11 @@ class TumblrLikesExtractor(TumblrExtractor): class TumblrAPI(oauth.OAuth1API): - 
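Editor's note: the new `/day/` extractor turns the requested calendar date into a Unix-timestamp window — `date.toordinal()` minus 719163 (the ordinal of 1970-01-01) counts days since the epoch, multiplied by 86400 seconds, with `before` set one day later. A quick check of that arithmetic, independent of the extractor:

```python
from datetime import date, datetime, timezone

def day_window(year, month, day):
    """Return (start, end) Unix timestamps covering one UTC calendar day."""
    # 719163 == date(1970, 1, 1).toordinal()
    start = (date(year, month, day).toordinal() - 719163) * 86400
    return start, start + 86400

start, end = day_window(2018, 1, 5)
assert start == int(datetime(2018, 1, 5, tzinfo=timezone.utc).timestamp())
print(start, end)  # 1515110400 1515196800
```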
"""Minimal interface for the Tumblr API v2""" + """Interface for the Tumblr API v2 + + https://github.com/tumblr/docs/blob/master/api.md + """ + ROOT = "https://api.tumblr.com" API_KEY = "O3hU2tMi5e4Qs5t3vezEi6L0qRORJ5y9oUpSGsrWu8iA3UCc3B" API_SECRET = "sFdsK3PDdP2QpYMRAoq0oDnw0sFS24XigXmdfnaeNZpJpqAn03" BLOG_CACHE = {} @@ -442,55 +489,46 @@ class TumblrAPI(oauth.OAuth1API): def info(self, blog): """Return general information about a blog""" - if blog not in self.BLOG_CACHE: - self.BLOG_CACHE[blog] = self._call(blog, "info", {})["blog"] - return self.BLOG_CACHE[blog] + try: + return self.BLOG_CACHE[blog] + except KeyError: + endpoint = "/v2/blog/{}/info".format(blog) + params = {"api_key": self.api_key} if self.api_key else None + self.BLOG_CACHE[blog] = blog = self._call(endpoint, params)["blog"] + return blog def avatar(self, blog, size="512"): """Retrieve a blog avatar""" if self.api_key: - url_fmt = "https://api.tumblr.com/v2/blog/{}/avatar/{}?api_key={}" - return url_fmt.format(blog, size, self.api_key) + return "{}/v2/blog/{}/avatar/{}?api_key={}".format( + self.ROOT, blog, size, self.api_key) + endpoint = "/v2/blog/{}/avatar".format(blog) params = {"size": size} - data = self._call(blog, "avatar", params, allow_redirects=False) - return data["avatar_url"] + return self._call( + endpoint, params, allow_redirects=False)["avatar_url"] def posts(self, blog, params): """Retrieve published posts""" - params["offset"] = self.extractor.config("offset") or 0 - params["limit"] = 50 + params["offset"] = self.extractor.config("offset") + params["limit"] = "50" params["reblog_info"] = "true" + params["type"] = self.posts_type + params["before"] = self.before - if self.posts_type: - params["type"] = self.posts_type - if self.before: - params["before"] = self.before + if self.before and params["offset"]: + self.log.warning("'offset' and 'date-max' cannot be used together") - while True: - data = self._call(blog, "posts", params) - self.BLOG_CACHE[blog] = data["blog"] - yield from data["posts"] - params["offset"] += params["limit"] - if params["offset"] >= data["total_posts"]: - return + return self._pagination(blog, "/posts", params, cache=True) def likes(self, blog): """Retrieve liked posts""" params = {"limit": "50", "before": self.before} - while True: - posts = self._call(blog, "likes", params)["liked_posts"] - if not posts: - return - yield from posts - params["before"] = posts[-1]["liked_timestamp"] + return self._pagination(blog, "/likes", params, key="liked_posts") - def _call(self, blog, endpoint, params, **kwargs): - if self.api_key: - params["api_key"] = self.api_key - url = "https://api.tumblr.com/v2/blog/{}/{}".format( - blog, endpoint) - - response = self.request(url, params=params, **kwargs) + def _call(self, endpoint, params, **kwargs): + url = self.ROOT + endpoint + kwargs["params"] = params + response = self.request(url, **kwargs) try: data = response.json() @@ -535,7 +573,7 @@ class TumblrAPI(oauth.OAuth1API): if self.extractor.config("ratelimit") == "wait": self.extractor.wait(seconds=reset) - return self._call(blog, endpoint, params) + return self._call(endpoint, params, **kwargs) t = (datetime.now() + timedelta(seconds=float(reset))).time() raise exception.StopExtraction( @@ -547,6 +585,29 @@ class TumblrAPI(oauth.OAuth1API): if reset: self.log.info("Hourly API rate limit exceeded") self.extractor.wait(seconds=reset) - return self._call(blog, endpoint, params) + return self._call(endpoint, params, **kwargs) raise exception.StopExtraction(data) + + def _pagination(self, blog, 
endpoint, params, key="posts", cache=False): + endpoint = "/v2/blog/{}{}".format(blog, endpoint) + if self.api_key: + params["api_key"] = self.api_key + + while True: + data = self._call(endpoint, params) + + if cache: + self.BLOG_CACHE[blog] = data["blog"] + cache = False + + yield from data[key] + + try: + endpoint = data["_links"]["next"]["href"] + except KeyError: + return + + params = None + if self.api_key: + endpoint += "&api_key=" + self.api_key diff --git a/gallery_dl/extractor/twibooru.py b/gallery_dl/extractor/twibooru.py index 30bf2f15..c3e0a262 100644 --- a/gallery_dl/extractor/twibooru.py +++ b/gallery_dl/extractor/twibooru.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2022 Mike Fährmann +# Copyright 2022-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -22,11 +22,11 @@ class TwibooruExtractor(BooruExtractor): filename_fmt = "{id}_{filename}.{extension}" archive_fmt = "{id}" request_interval = 6.05 + page_start = 1 per_page = 50 root = "https://twibooru.org" - def __init__(self, match): - BooruExtractor.__init__(self, match) + def _init(self): self.api = TwibooruAPI(self) _file_url = operator.itemgetter("view_url") @@ -230,7 +230,7 @@ class TwibooruAPI(): elif not api_key: params["filter_id"] = "2" - params["page"] = 1 + params["page"] = extr.page_start params["per_page"] = per_page = extr.per_page while True: diff --git a/gallery_dl/extractor/twitter.py b/gallery_dl/extractor/twitter.py index 5e68f138..a2ca9c18 100644 --- a/gallery_dl/extractor/twitter.py +++ b/gallery_dl/extractor/twitter.py @@ -13,6 +13,7 @@ from .. import text, util, exception from ..cache import cache import itertools import json +import re BASE_PATTERN = r"(?:https?://)?(?:www\.|mobile\.)?(?:[fv]x)?twitter\.com" @@ -23,14 +24,16 @@ class TwitterExtractor(Extractor): directory_fmt = ("{category}", "{user[name]}") filename_fmt = "{tweet_id}_{num}.{extension}" archive_fmt = "{tweet_id}_{retweet_id}_{num}" - cookiedomain = ".twitter.com" - cookienames = ("auth_token",) + cookies_domain = ".twitter.com" + cookies_names = ("auth_token",) root = "https://twitter.com" browser = "firefox" def __init__(self, match): Extractor.__init__(self, match) self.user = match.group(1) + + def _init(self): self.textonly = self.config("text-tweets", False) self.retweets = self.config("retweets", False) self.replies = self.config("replies", True) @@ -75,6 +78,10 @@ class TwitterExtractor(Extractor): else: seen_tweets = None + if self.twitpic: + self._find_twitpic = re.compile( + r"https?(://twitpic\.com/(?!photos/)\w+)").findall + for tweet in self.tweets(): if "legacy" in tweet: @@ -231,12 +238,24 @@ class TwitterExtractor(Extractor): files.append({"url": url}) def _extract_twitpic(self, tweet, files): - for url in tweet["entities"].get("urls", ()): + urls = {} + + # collect URLs from entities + for url in tweet["entities"].get("urls") or (): url = url["expanded_url"] if "//twitpic.com/" not in url or "/photos/" in url: continue if url.startswith("http:"): url = "https" + url[4:] + urls[url] = None + + # collect URLs from text + for url in self._find_twitpic( + tweet.get("full_text") or tweet.get("text") or ""): + urls["https" + url] = None + + # extract actual URLs + for url in urls: response = self.request(url, fatal=False) if response.status_code >= 400: continue @@ -278,6 +297,8 @@ class TwitterExtractor(Extractor): tget("quoted_by_id_str")), "reply_id" : text.parse_int( 
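Editor's note: `_extract_twitpic` now gathers candidate TwitPic links from two places — the `entities` list and the raw tweet text — normalizes them to https, and de-duplicates them by using a dict as an ordered set. Stripped of the extractor plumbing, the collection step looks roughly like this (the sample tweet is made up):

```python
import re

FIND_TWITPIC = re.compile(r"https?(://twitpic\.com/(?!photos/)\w+)").findall

def collect_twitpic_urls(tweet):
    """Return unique https TwitPic links from entities and tweet text (sketch)."""
    urls = {}  # dict keys keep insertion order and drop duplicates

    for url in (tweet.get("entities", {}).get("urls") or ()):
        expanded = url["expanded_url"]
        if "//twitpic.com/" in expanded and "/photos/" not in expanded:
            if expanded.startswith("http:"):
                expanded = "https" + expanded[4:]
            urls[expanded] = None

    for tail in FIND_TWITPIC(tweet.get("full_text") or tweet.get("text") or ""):
        urls["https" + tail] = None

    return list(urls)

tweet = {"full_text": "old pic http://twitpic.com/abc123",
         "entities": {"urls": [{"expanded_url": "http://twitpic.com/abc123"}]}}
print(collect_twitpic_urls(tweet))  # ['https://twitpic.com/abc123']
```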
tget("in_reply_to_status_id_str")), + "conversation_id": text.parse_int( + tget("conversation_id_str")), "date" : date, "author" : author, "user" : self._user or author, @@ -319,7 +340,11 @@ class TwitterExtractor(Extractor): return tdata def _transform_user(self, user): - uid = user.get("rest_id") or user["id_str"] + try: + uid = user.get("rest_id") or user["id_str"] + except KeyError: + # private/invalid user (#4349) + return {} try: return self._user_cache[uid] @@ -436,29 +461,26 @@ class TwitterExtractor(Extractor): """Yield all relevant tweet objects""" def login(self): - if not self._check_cookies(self.cookienames): - username, password = self._get_auth_info() - if username: - self._update_cookies(_login_impl(self, username, password)) + if self.cookies_check(self.cookies_names): + return + + username, password = self._get_auth_info() + if username: + self.cookies_update(_login_impl(self, username, password)) -class TwitterTimelineExtractor(TwitterExtractor): - """Extractor for a Twitter user timeline""" - subcategory = "timeline" +class TwitterUserExtractor(TwitterExtractor): + """Extractor for a Twitter user""" + subcategory = "user" pattern = (BASE_PATTERN + r"/(?!search)(?:([^/?#]+)/?(?:$|[?#])" r"|i(?:/user/|ntent/user\?user_id=)(\d+))") test = ( ("https://twitter.com/supernaturepics", { - "range": "1-40", - "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40", - }), - # suspended account (#2216) - ("https://twitter.com/OptionalTypo", { - "exception": exception.NotFoundError, - }), - # suspended account user ID - ("https://twitter.com/id:772949683521978368", { - "exception": exception.NotFoundError, + "options": (("include", "all"),), + "pattern": r"https://twitter\.com/supernaturepics" + r"/(photo|header_photo|timeline|tweets" + r"|media|with_replies|likes)$", + "count": 7, }), ("https://mobile.twitter.com/supernaturepics?p=i"), ("https://www.twitter.com/id:2976459548"), @@ -474,6 +496,43 @@ class TwitterTimelineExtractor(TwitterExtractor): if user_id: self.user = "id:" + user_id + def initialize(self): + pass + + def items(self): + base = "{}/{}/".format(self.root, self.user) + return self._dispatch_extractors(( + (TwitterAvatarExtractor , base + "photo"), + (TwitterBackgroundExtractor, base + "header_photo"), + (TwitterTimelineExtractor , base + "timeline"), + (TwitterTweetsExtractor , base + "tweets"), + (TwitterMediaExtractor , base + "media"), + (TwitterRepliesExtractor , base + "with_replies"), + (TwitterLikesExtractor , base + "likes"), + ), ("timeline",)) + + +class TwitterTimelineExtractor(TwitterExtractor): + """Extractor for a Twitter user timeline""" + subcategory = "timeline" + pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)" + test = ( + ("https://twitter.com/supernaturepics/timeline", { + "range": "1-40", + "url": "c570ac1aae38ed1463be726cc46f31cac3d82a40", + }), + # suspended account (#2216) + ("https://twitter.com/OptionalTypo/timeline", { + "exception": exception.NotFoundError, + }), + # suspended account user ID + ("https://twitter.com/id:772949683521978368/timeline", { + "exception": exception.NotFoundError, + }), + ("https://mobile.twitter.com/supernaturepics/timeline#t"), + ("https://www.twitter.com/id:2976459548/timeline"), + ) + def tweets(self): # yield initial batch of (media) tweets tweet = None @@ -491,13 +550,13 @@ class TwitterTimelineExtractor(TwitterExtractor): if not self.textonly: # try to search for media-only tweets tweet = None - for tweet in self.api.search_adaptive(query + " filter:links"): + for tweet in 
self.api.search_timeline(query + " filter:links"): yield tweet if tweet is not None: return # yield unfiltered search results - yield from self.api.search_adaptive(query) + yield from self.api.search_timeline(query) def _select_tweet_source(self): strategy = self.config("strategy") @@ -647,8 +706,8 @@ class TwitterSearchExtractor(TwitterExtractor): subcategory = "search" pattern = BASE_PATTERN + r"/search/?\?(?:[^&#]+&)*q=([^&#]+)" test = ("https://twitter.com/search?q=nature", { - "range": "1-40", - "count": 40, + "range": "1-20", + "count": 20, "archive": False, }) @@ -674,7 +733,7 @@ class TwitterSearchExtractor(TwitterExtractor): except KeyError: pass - return self.api.search_adaptive(query) + return self.api.search_timeline(query) class TwitterHashtagExtractor(TwitterExtractor): @@ -781,7 +840,13 @@ class TwitterTweetExtractor(TwitterExtractor): ("https://twitter.com/i/web/status/112900228289540096", { "options": (("twitpic", True), ("cards", False)), "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.jpg", - "count": 3, + "count": 2, # 1 duplicate + }), + # TwitPic URL not in 'urls' (#3792) + ("https://twitter.com/shimoigusaP/status/8138669971", { + "options": (("twitpic", True),), + "pattern": r"https://\w+.cloudfront.net/photos/large/\d+.png", + "count": 1, }), # Twitter card (#1005) ("https://twitter.com/billboard/status/1306599586602135555", { @@ -894,14 +959,36 @@ Your reaction.""", self.tweet_id = match.group(2) def tweets(self): - if self.config("conversations", False): + conversations = self.config("conversations") + if conversations: + self._accessible = (conversations == "accessible") return self._tweets_conversation(self.tweet_id) - else: - return self._tweets_single(self.tweet_id) + + endpoint = self.config("tweet-endpoint") + if endpoint == "detail" or endpoint in (None, "auto") and \ + self.api.headers["x-twitter-auth-type"]: + return self._tweets_detail(self.tweet_id) + + return self._tweets_single(self.tweet_id) def _tweets_single(self, tweet_id): tweets = [] + tweet = self.api.tweet_result_by_rest_id(tweet_id) + self._assign_user(tweet["core"]["user_results"]["result"]) + + while True: + tweets.append(tweet) + tweet_id = tweet["legacy"].get("quoted_status_id_str") + if not tweet_id: + break + tweet = self.api.tweet_result_by_rest_id(tweet_id) + + return tweets + + def _tweets_detail(self, tweet_id): + tweets = [] + for tweet in self.api.tweet_detail(tweet_id): if tweet["rest_id"] == tweet_id or \ tweet.get("_retweet_id_str") == tweet_id: @@ -925,6 +1012,11 @@ Your reaction.""", tweet.get("_retweet_id_str") == tweet_id: self._assign_user(tweet["core"]["user_results"]["result"]) break + else: + # initial Tweet not accessible + if self._accessible: + return () + return buffer return itertools.chain(buffer, tweets) @@ -1035,24 +1127,24 @@ class TwitterAPI(): def __init__(self, extractor): self.extractor = extractor - self.root = "https://api.twitter.com" + self.root = "https://twitter.com/i/api" self._nsfw_warning = True self._syndication = self.extractor.syndication self._json_dumps = json.JSONEncoder(separators=(",", ":")).encode - cookies = extractor.session.cookies - cookiedomain = extractor.cookiedomain + cookies = extractor.cookies + cookies_domain = extractor.cookies_domain csrf = extractor.config("csrf") if csrf is None or csrf == "cookies": - csrf_token = cookies.get("ct0", domain=cookiedomain) + csrf_token = cookies.get("ct0", domain=cookies_domain) else: csrf_token = None if not csrf_token: csrf_token = util.generate_token() - cookies.set("ct0", 
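Editor's note: without login, a single Tweet is now resolved through `TweetResultByRestId`, and `_tweets_single` then walks the chain of quoted Tweets by re-querying each `quoted_status_id_str`. The same loop in isolation, with `fetch` standing in for the GraphQL call:

```python
def collect_quote_chain(tweet_id, fetch):
    """Follow quoted-status links starting from one Tweet ID (sketch).

    `fetch` is any callable mapping a Tweet ID to its GraphQL result dict.
    """
    tweets = []
    tweet = fetch(tweet_id)

    while True:
        tweets.append(tweet)
        quoted_id = tweet["legacy"].get("quoted_status_id_str")
        if not quoted_id:
            break
        tweet = fetch(quoted_id)

    return tweets
```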
csrf_token, domain=cookiedomain) + cookies.set("ct0", csrf_token, domain=cookies_domain) - auth_token = cookies.get("auth_token", domain=cookiedomain) + auth_token = cookies.get("auth_token", domain=cookies_domain) self.headers = { "Accept": "*/*", @@ -1064,7 +1156,6 @@ class TwitterAPI(): "x-twitter-client-language": "en", "x-twitter-active-user": "yes", "x-csrf-token": csrf_token, - "Origin": "https://twitter.com", "Referer": "https://twitter.com/", } self.params = { @@ -1108,47 +1199,84 @@ class TwitterAPI(): "enrichments,superFollowMetadata,unmentionInfo,editControl," "collab_control,vibe", } - self.variables = { - "withDownvotePerspective": False, - "withReactionsMetadata": False, - "withReactionsPerspective": False, - } self.features = { - "blue_business_profile_image_shape_enabled": False, - "responsive_web_twitter_blue_verified_badge_is_enabled": True, + "hidden_profile_likes_enabled": False, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, - "responsive_web_graphql_skip_user_profile_" - "image_extensions_enabled": False, + "subscriptions_verification_info_verified_since_enabled": True, + "highlights_tweets_tab_ui_enabled": True, + "creator_subscriptions_tweet_preview_api_enabled": True, + "responsive_web_graphql_" + "skip_user_profile_image_extensions_enabled": False, "responsive_web_graphql_timeline_navigation_enabled": True, } self.features_pagination = { - "blue_business_profile_image_shape_enabled": False, - "responsive_web_twitter_blue_verified_badge_is_enabled": True, + "rweb_lists_timeline_redesign_enabled": True, "responsive_web_graphql_exclude_directive_enabled": True, "verified_phone_label_enabled": False, + "creator_subscriptions_tweet_preview_api_enabled": True, "responsive_web_graphql_timeline_navigation_enabled": True, "responsive_web_graphql_skip_user_profile_" "image_extensions_enabled": False, "tweetypie_unmention_optimization_enabled": True, - "vibe_api_enabled": True, "responsive_web_edit_tweet_api_enabled": True, "graphql_is_translatable_rweb_tweet_is_translatable_enabled": True, "view_counts_everywhere_api_enabled": True, "longform_notetweets_consumption_enabled": True, "tweet_awards_web_tipping_enabled": False, - "freedom_of_speech_not_reach_fetch_enabled": False, + "freedom_of_speech_not_reach_fetch_enabled": True, "standardized_nudges_misinfo": True, "tweet_with_visibility_results_prefer_gql_" "limited_actions_policy_enabled": False, "interactive_text_enabled": True, "responsive_web_text_conversations_enabled": False, - "longform_notetweets_richtext_consumption_enabled": False, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": False, "responsive_web_enhance_cards_enabled": False, } + def tweet_result_by_rest_id(self, tweet_id): + endpoint = "/graphql/2ICDjqPd81tulZcYrtpTuQ/TweetResultByRestId" + params = { + "variables": self._json_dumps({ + "tweetId": tweet_id, + "withCommunity": False, + "includePromotedContent": False, + "withVoice": False, + }), + "features": self._json_dumps({ + "creator_subscriptions_tweet_preview_api_enabled": True, + "tweetypie_unmention_optimization_enabled": True, + "responsive_web_edit_tweet_api_enabled": True, + "graphql_is_translatable_rweb_tweet_is_translatable_enabled": + True, + "view_counts_everywhere_api_enabled": True, + "longform_notetweets_consumption_enabled": True, + "responsive_web_twitter_article_tweet_consumption_enabled": + False, + "tweet_awards_web_tipping_enabled": False, + 
"freedom_of_speech_not_reach_fetch_enabled": True, + "standardized_nudges_misinfo": True, + "tweet_with_visibility_results_prefer_gql_" + "limited_actions_policy_enabled": True, + "longform_notetweets_rich_text_read_enabled": True, + "longform_notetweets_inline_media_enabled": True, + "responsive_web_graphql_exclude_directive_enabled": True, + "verified_phone_label_enabled": False, + "responsive_web_media_download_video_enabled": False, + "responsive_web_graphql_skip_user_profile_" + "image_extensions_enabled": False, + "responsive_web_graphql_timeline_navigation_enabled": True, + "responsive_web_enhance_cards_enabled": False, + }), + "fieldToggles": self._json_dumps({ + "withArticleRichContentState": False, + }), + } + return self._call(endpoint, params)["data"]["tweetResult"]["result"] + def tweet_detail(self, tweet_id): - endpoint = "/graphql/AV_lPTkN6Fc6LgerQpK8Zg/TweetDetail" + endpoint = "/graphql/JlLZj42Ltr2qwjasw-l5lQ/TweetDetail" variables = { "focalTweetId": tweet_id, "referrer": "profile", @@ -1156,9 +1284,7 @@ class TwitterAPI(): "includePromotedContent": True, "withCommunity": True, "withQuickPromoteEligibilityTweetFields": True, - "withBirdwatchNotes": False, - "withSuperFollowsUserFields": True, - "withSuperFollowsTweetFields": True, + "withBirdwatchNotes": True, "withVoice": True, "withV2Timeline": True, } @@ -1166,7 +1292,7 @@ class TwitterAPI(): endpoint, variables, ("threaded_conversation_with_injections_v2",)) def user_tweets(self, screen_name): - endpoint = "/graphql/BeHK76TOCY3P8nO-FWocjA/UserTweets" + endpoint = "/graphql/-AY51QoFpVf-w7TxjQ6lpw/UserTweets" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1178,7 +1304,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_tweets_and_replies(self, screen_name): - endpoint = "/graphql/eZVlZu_1gwb6hMUDXBnZoQ/UserTweetsAndReplies" + endpoint = "/graphql/urrCZMyyIh1FkSFi2cdPUA/UserTweetsAndReplies" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1190,7 +1316,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_media(self, screen_name): - endpoint = "/graphql/d_ONZLUHGCsErBCriRsLXg/UserMedia" + endpoint = "/graphql/lo965xQZdN2-eSM1Jc-W_A/UserMedia" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1223,7 +1349,7 @@ class TwitterAPI(): features=False) def user_likes(self, screen_name): - endpoint = "/graphql/fN4-E0MjFJ9Cn7IYConL7g/Likes" + endpoint = "/graphql/6JET1d0iHsIzW0Zjs3OOwQ/Likes" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1236,7 +1362,7 @@ class TwitterAPI(): return self._pagination_tweets(endpoint, variables) def user_bookmarks(self): - endpoint = "/graphql/RV1g3b8n_SGOHwkqKYSCFw/Bookmarks" + endpoint = "/graphql/YNtYqNuki6_oiVwx0uP8mQ/Bookmarks" variables = { "count": 100, } @@ -1247,7 +1373,7 @@ class TwitterAPI(): features=features) def list_latest_tweets_timeline(self, list_id): - endpoint = "/graphql/5DAiJG3bD77SiWEs4xViBw/ListLatestTweetsTimeline" + endpoint = "/graphql/ZBbXrl37E6za5ml-DIpmgg/ListLatestTweetsTimeline" variables = { "listId": list_id, "count": 100, @@ -1255,15 +1381,23 @@ class TwitterAPI(): return self._pagination_tweets( endpoint, variables, ("list", "tweets_timeline", "timeline")) - def search_adaptive(self, query): - endpoint = "/2/search/adaptive.json" - params = self.params.copy() - params["q"] = query - params["tweet_search_mode"] = "live" - params["query_source"] = 
"typed_query" - params["pc"] = "1" - params["spelling_corrections"] = "1" - return self._pagination_legacy(endpoint, params) + def search_timeline(self, query): + endpoint = "/graphql/7jT5GT59P8IFjgxwqnEdQw/SearchTimeline" + variables = { + "rawQuery": query, + "count": 20, + "product": "Latest", + "withDownvotePerspective": False, + "withReactionsMetadata": False, + "withReactionsPerspective": False, + } + features = self.features_pagination.copy() + features["blue_business_profile_image_shape_enabled"] = False + features["vibe_api_enabled"] = True + return self._pagination_tweets( + endpoint, variables, + ("search_by_raw_query", "search_timeline", "timeline"), + features=features) def live_event_timeline(self, event_id): endpoint = "/2/live_event/timeline/{}.json".format(event_id) @@ -1282,11 +1416,10 @@ class TwitterAPI(): ["twitter_objects"]["live_events"][event_id]) def list_by_rest_id(self, list_id): - endpoint = "/graphql/D0EoyrDcct2MEqC-LnPzFg/ListByRestId" + endpoint = "/graphql/AmCdeFUvlrKAO96yHr-GCg/ListByRestId" params = { "variables": self._json_dumps({ "listId": list_id, - "withSuperFollowsUserFields": True, }), "features": self._json_dumps(self.features), } @@ -1296,7 +1429,7 @@ class TwitterAPI(): raise exception.NotFoundError("list") def list_members(self, list_id): - endpoint = "/graphql/tzsIIbGUH9RyFCVmtO2W2w/ListMembers" + endpoint = "/graphql/a_ZQomd3MMk1crWkeiQBPg/ListMembers" variables = { "listId": list_id, "count": 100, @@ -1306,7 +1439,7 @@ class TwitterAPI(): endpoint, variables, ("list", "members_timeline", "timeline")) def user_following(self, screen_name): - endpoint = "/graphql/FaBzCqZXuQCb4PhB0RHqHw/Following" + endpoint = "/graphql/JPZiqKjET7_M1r5Tlr8pyA/Following" variables = { "userId": self._user_id_by_screen_name(screen_name), "count": 100, @@ -1315,18 +1448,20 @@ class TwitterAPI(): return self._pagination_users(endpoint, variables) def user_by_rest_id(self, rest_id): - endpoint = "/graphql/S2BkcAyFMG--jef2N6Dgzw/UserByRestId" + endpoint = "/graphql/1YAM811Q8Ry4XyPpJclURQ/UserByRestId" + features = self.features.copy() + features["blue_business_profile_image_shape_enabled"] = True params = { "variables": self._json_dumps({ "userId": rest_id, "withSafetyModeUserFields": True, }), - "features": self._json_dumps(self.features), + "features": self._json_dumps(features), } return self._call(endpoint, params)["data"]["user"]["result"] def user_by_screen_name(self, screen_name): - endpoint = "/graphql/k26ASEiniqy4eXMdknTSoQ/UserByScreenName" + endpoint = "/graphql/XA6F1nJELYg65hxOC2Ekmg/UserByScreenName" params = { "variables": self._json_dumps({ "screen_name": screen_name, @@ -1357,17 +1492,19 @@ class TwitterAPI(): def _guest_token(self): endpoint = "/1.1/guest/activate.json" self.extractor.log.info("Requesting guest token") - return str(self._call(endpoint, None, "POST", False)["guest_token"]) + return str(self._call( + endpoint, None, "POST", False, "https://api.twitter.com", + )["guest_token"]) def _authenticate_guest(self): guest_token = self._guest_token() if guest_token != self.headers["x-guest-token"]: self.headers["x-guest-token"] = guest_token - self.extractor.session.cookies.set( - "gt", guest_token, domain=self.extractor.cookiedomain) + self.extractor.cookies.set( + "gt", guest_token, domain=self.extractor.cookies_domain) - def _call(self, endpoint, params, method="GET", auth=True): - url = self.root + endpoint + def _call(self, endpoint, params, method="GET", auth=True, root=None): + url = (root or self.root) + endpoint while True: if not 
self.headers["x-twitter-auth-type"] and auth: @@ -1388,11 +1525,20 @@ class TwitterAPI(): if response.status_code == 429: # rate limit exceeded + if self.extractor.config("ratelimit") == "abort": + raise exception.StopExtraction("Rate limit exceeded") + until = response.headers.get("x-rate-limit-reset") seconds = None if until else 60 self.extractor.wait(until=until, seconds=seconds) continue + if response.status_code == 403 and \ + not self.headers["x-twitter-auth-type"] and \ + endpoint == "/2/search/adaptive.json": + raise exception.AuthorizationError( + "Login required to access search results") + # error try: data = response.json() @@ -1501,7 +1647,6 @@ class TwitterAPI(): def _pagination_tweets(self, endpoint, variables, path=None, stop_tweets=True, features=None): extr = self.extractor - variables.update(self.variables) original_retweets = (extr.retweets == "original") pinned_tweet = extr.pinned @@ -1525,12 +1670,20 @@ class TwitterAPI(): instructions = instructions[key] instructions = instructions["instructions"] + cursor = None + entries = None for instr in instructions: - if instr.get("type") == "TimelineAddEntries": + instr_type = instr.get("type") + if instr_type == "TimelineAddEntries": entries = instr["entries"] - break - else: - raise KeyError() + elif instr_type == "TimelineReplaceEntry": + entry = instr["entry"] + if entry["entryId"].startswith("cursor-bottom-"): + cursor = entry["content"]["value"] + if entries is None: + if not cursor: + return + entries = () except LookupError: extr.log.debug(data) @@ -1541,8 +1694,8 @@ class TwitterAPI(): if user.get("blocked_by"): if self.headers["x-twitter-auth-type"] and \ extr.config("logout"): - extr._cookiefile = None - del extr.session.cookies["auth_token"] + extr.cookies_file = None + del extr.cookies["auth_token"] self.headers["x-twitter-auth-type"] = None extr.log.info("Retrying API request as guest") continue @@ -1558,7 +1711,7 @@ class TwitterAPI(): "Unable to retrieve Tweets from this timeline") tweets = [] - tweet = cursor = None + tweet = None if pinned_tweet: pinned_tweet = False @@ -1664,12 +1817,11 @@ class TwitterAPI(): variables["cursor"] = cursor def _pagination_users(self, endpoint, variables, path=None): - variables.update(self.variables) params = {"variables": None, "features" : self._json_dumps(self.features_pagination)} while True: - cursor = entry = stop = None + cursor = entry = None params["variables"] = self._json_dumps(variables) data = self._call(endpoint, params)["data"] @@ -1698,11 +1850,8 @@ class TwitterAPI(): yield user elif entry["entryId"].startswith("cursor-bottom-"): cursor = entry["content"]["value"] - elif instr["type"] == "TimelineTerminateTimeline": - if instr["direction"] == "Bottom": - stop = True - if stop or not cursor or not entry: + if not cursor or cursor.startswith(("-1|", "0|")) or not entry: return variables["cursor"] = cursor @@ -1800,7 +1949,7 @@ def _login_impl(extr, username, password): extr.log.debug(response.text) raise exception.AuthenticationError(", ".join(errors)) - extr.session.cookies.clear() + extr.cookies.clear() api = TwitterAPI(extr) api._authenticate_guest() headers = api.headers @@ -1940,5 +2089,5 @@ def _login_impl(extr, username, password): return { cookie.name: cookie.value - for cookie in extr.session.cookies + for cookie in extr.cookies } diff --git a/gallery_dl/extractor/urlshortener.py b/gallery_dl/extractor/urlshortener.py index 1a39b5be..4b49a638 100644 --- a/gallery_dl/extractor/urlshortener.py +++ b/gallery_dl/extractor/urlshortener.py @@ -34,7 +34,7 @@ 
BASE_PATTERN = UrlshortenerExtractor.update(INSTANCES) class UrlshortenerLinkExtractor(UrlshortenerExtractor): """Extractor for general-purpose URL shorteners""" subcategory = "link" - pattern = BASE_PATTERN + r"/([^/?&#]+)" + pattern = BASE_PATTERN + r"/([^/?#]+)" test = ( ("https://bit.ly/3cWIUgq", { "count": 1, @@ -54,6 +54,7 @@ class UrlshortenerLinkExtractor(UrlshortenerExtractor): UrlshortenerExtractor.__init__(self, match) self.id = match.group(match.lastindex) + def _init(self): try: self.headers = INSTANCES[self.category]["headers"] except Exception: diff --git a/gallery_dl/extractor/vipergirls.py b/gallery_dl/extractor/vipergirls.py index 1cebdf75..084f9b25 100644 --- a/gallery_dl/extractor/vipergirls.py +++ b/gallery_dl/extractor/vipergirls.py @@ -9,7 +9,10 @@ """Extractors for https://vipergirls.to/""" from .common import Extractor, Message -from .. import text, exception +from .. import text, util, exception +from ..cache import cache + +from xml.etree import ElementTree BASE_PATTERN = r"(?:https?://)?(?:www\.)?vipergirls\.to" @@ -18,26 +21,51 @@ class VipergirlsExtractor(Extractor): """Base class for vipergirls extractors""" category = "vipergirls" root = "https://vipergirls.to" + request_interval = 0.5 + request_interval_min = 0.2 + cookies_domain = ".vipergirls.to" + cookies_names = ("vg_userid", "vg_password") - def __init__(self, match): - Extractor.__init__(self, match) - self.session.headers["Referer"] = self.root + def _init(self): + self.session.headers["Referer"] = self.root + "/" def items(self): - for html in self.posts(): + self.login() - pos = html.find('
    ")[2].strip()), - } + for post in self.posts(): + data = post.attrib + data["thread_id"] = self.thread_id yield Message.Directory, data - for href in text.extract_iter(html, '', '') - - url = text.extr(page, '
  • ") tags.append(category + ":" + name.strip()) return data diff --git a/gallery_dl/formatter.py b/gallery_dl/formatter.py index fc36fa2c..500eaa19 100644 --- a/gallery_dl/formatter.py +++ b/gallery_dl/formatter.py @@ -9,6 +9,7 @@ """String formatters""" import os +import sys import time import string import _string @@ -255,7 +256,11 @@ def parse_field_name(field_name): func = operator.itemgetter try: if ":" in key: - key = _slice(key) + if key[0] == "b": + func = _bytesgetter + key = _slice(key[1:]) + else: + key = _slice(key) else: key = key.strip("\"'") except TypeError: @@ -276,6 +281,14 @@ def _slice(indices): ) +def _bytesgetter(slice, encoding=sys.getfilesystemencoding()): + + def apply_slice_bytes(obj): + return obj.encode(encoding)[slice].decode(encoding, "ignore") + + return apply_slice_bytes + + def _build_format_func(format_spec, default): if format_spec: return _FORMAT_SPECIFIERS.get( @@ -295,11 +308,20 @@ def _parse_optional(format_spec, default): def _parse_slice(format_spec, default): indices, _, format_spec = format_spec.partition("]") - slice = _slice(indices[1:]) fmt = _build_format_func(format_spec, default) - def apply_slice(obj): - return fmt(obj[slice]) + if indices[1] == "b": + slice_bytes = _bytesgetter(_slice(indices[2:])) + + def apply_slice(obj): + return fmt(slice_bytes(obj)) + + else: + slice = _slice(indices[1:]) + + def apply_slice(obj): + return fmt(obj[slice]) + return apply_slice @@ -415,6 +437,7 @@ _CONVERSIONS = { "T": util.datetime_to_timestamp_string, "d": text.parse_timestamp, "U": text.unescape, + "H": lambda s: text.unescape(text.remove_html(s)), "g": text.slugify, "S": util.to_string, "s": str, diff --git a/gallery_dl/job.py b/gallery_dl/job.py index ca5785d9..f169788e 100644 --- a/gallery_dl/job.py +++ b/gallery_dl/job.py @@ -32,6 +32,21 @@ class Job(): self.kwdict = {} self.status = 0 + cfgpath = [] + if parent and parent.extractor.category != extr.category: + cat = "{}>{}".format( + parent.extractor.category, extr.category) + cfgpath.append((cat, extr.subcategory)) + cfgpath.append((extr.category, extr.subcategory)) + if extr.basecategory: + if not cfgpath: + cfgpath.append((extr.category, extr.subcategory)) + cfgpath.append((extr.basecategory, extr.subcategory)) + if cfgpath: + extr._cfgpath = cfgpath + extr.config = extr._config_shared + extr.config_accumulate = extr._config_shared_accumulate + actions = extr.config("actions") if actions: from .actions import parse @@ -125,8 +140,7 @@ class Job(): log.info("No results for %s", extractor.url) finally: self.handle_finalize() - if extractor.finalize: - extractor.finalize() + extractor.finalize() return self.status @@ -378,7 +392,7 @@ class DownloadJob(Job): for callback in hooks["post-after"]: callback(pathfmt) - self.extractor._store_cookies() + self.extractor.cookies_store() if "finalize" in hooks: status = self.status for callback in hooks["finalize"]: diff --git a/gallery_dl/option.py b/gallery_dl/option.py index aad307f3..08e6e701 100644 --- a/gallery_dl/option.py +++ b/gallery_dl/option.py @@ -148,20 +148,6 @@ def build_parser(): help="Delete cached login sessions, cookies, etc. 
for MODULE " "(ALL to delete everything)", ) - general.add_argument( - "--cookies", - dest="cookies", metavar="FILE", action=ConfigAction, - help="File to load additional cookies from", - ) - general.add_argument( - "--cookies-from-browser", - dest="cookies_from_browser", - metavar="BROWSER[+KEYRING][:PROFILE][::CONTAINER]", - help=("Name of the browser to load cookies from, " - "with optional keyring name prefixed with '+', " - "profile prefixed with ':', and " - "container prefixed with '::' ('none' for no container)"), - ) output = parser.add_argument_group("Output Options") output.add_argument( @@ -374,6 +360,28 @@ def build_parser(): help="Enable .netrc authentication data", ) + cookies = parser.add_argument_group("Cookie Options") + cookies.add_argument( + "-C", "--cookies", + dest="cookies", metavar="FILE", action=ConfigAction, + help="File to load additional cookies from", + ) + cookies.add_argument( + "--cookies-export", + dest="cookies-update", metavar="FILE", action=ConfigAction, + help="Export session cookies to FILE", + ) + cookies.add_argument( + "--cookies-from-browser", + dest="cookies_from_browser", + metavar="BROWSER[/DOMAIN][+KEYRING][:PROFILE][::CONTAINER]", + help=("Name of the browser to load cookies from, with optional " + "domain prefixed with '/', " + "keyring name prefixed with '+', " + "profile prefixed with ':', and " + "container prefixed with '::' ('none' for no container)"), + ) + selection = parser.add_argument_group("Selection Options") selection.add_argument( "--download-archive", diff --git a/gallery_dl/postprocessor/__init__.py b/gallery_dl/postprocessor/__init__.py index ee490e79..46905547 100644 --- a/gallery_dl/postprocessor/__init__.py +++ b/gallery_dl/postprocessor/__init__.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -# Copyright 2018-2021 Mike Fährmann +# Copyright 2018-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -14,6 +14,7 @@ modules = [ "exec", "metadata", "mtime", + "python", "ugoira", "zip", ] diff --git a/gallery_dl/postprocessor/common.py b/gallery_dl/postprocessor/common.py index c28d060d..10d9fbab 100644 --- a/gallery_dl/postprocessor/common.py +++ b/gallery_dl/postprocessor/common.py @@ -45,5 +45,7 @@ class PostProcessor(): self.name, archive, exc.__class__.__name__, exc) else: self.log.debug("Using %s archive '%s'", self.name, archive) + return True else: self.archive = None + return False diff --git a/gallery_dl/postprocessor/exec.py b/gallery_dl/postprocessor/exec.py index e81c6cfe..39188f16 100644 --- a/gallery_dl/postprocessor/exec.py +++ b/gallery_dl/postprocessor/exec.py @@ -11,6 +11,7 @@ from .common import PostProcessor from .. 
import util, formatter import subprocess +import os if util.WINDOWS: @@ -60,6 +61,7 @@ class ExecPP(PostProcessor): kwdict["_path"] = pathfmt.realpath args = [arg.format_map(kwdict) for arg in self.args] + args[0] = os.path.expanduser(args[0]) self._exec(args, False) if archive: diff --git a/gallery_dl/postprocessor/metadata.py b/gallery_dl/postprocessor/metadata.py index 714f4fef..5004bed6 100644 --- a/gallery_dl/postprocessor/metadata.py +++ b/gallery_dl/postprocessor/metadata.py @@ -124,10 +124,8 @@ class MetadataPP(PostProcessor): for key, func in self.fields.items(): obj = kwdict try: - while "[" in key: - name, _, key = key.partition("[") - obj = obj[name] - key = key.rstrip("]") + if "[" in key: + obj, key = _traverse(obj, key) obj[key] = func(kwdict) except Exception: pass @@ -137,10 +135,8 @@ class MetadataPP(PostProcessor): for key in self.fields: obj = kwdict try: - while "[" in key: - name, _, key = key.partition("[") - obj = obj[name] - key = key.rstrip("]") + if "[" in key: + obj, key = _traverse(obj, key) del obj[key] except Exception: pass @@ -214,4 +210,15 @@ class MetadataPP(PostProcessor): ) +def _traverse(obj, key): + name, _, key = key.partition("[") + obj = obj[name] + + while "[" in key: + name, _, key = key.partition("[") + obj = obj[name.strip("\"']")] + + return obj, key.strip("\"']") + + __postprocessor__ = MetadataPP diff --git a/gallery_dl/postprocessor/python.py b/gallery_dl/postprocessor/python.py new file mode 100644 index 00000000..db71da25 --- /dev/null +++ b/gallery_dl/postprocessor/python.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- + +# Copyright 2023 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Run Python functions""" + +from .common import PostProcessor +from .. import util + + +class PythonPP(PostProcessor): + + def __init__(self, job, options): + PostProcessor.__init__(self, job) + + spec = options["function"] + module_name, _, function_name = spec.rpartition(":") + module = util.import_file(module_name) + self.function = getattr(module, function_name) + + if self._init_archive(job, options): + self.run = self.run_archive + + events = options.get("event") + if events is None: + events = ("file",) + elif isinstance(events, str): + events = events.split(",") + job.register_hooks({event: self.run for event in events}, options) + + def run(self, pathfmt): + self.function(pathfmt.kwdict) + + def run_archive(self, pathfmt): + kwdict = pathfmt.kwdict + if self.archive.check(kwdict): + return + self.function(kwdict) + self.archive.add(kwdict) + + +__postprocessor__ = PythonPP diff --git a/gallery_dl/version.py b/gallery_dl/version.py index b698a01b..39cfbd1c 100644 --- a/gallery_dl/version.py +++ b/gallery_dl/version.py @@ -6,4 +6,4 @@ # it under the terms of the GNU General Public License version 2 as # published by the Free Software Foundation. 
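The new python post processor introduced above resolves its "function" option of the form "module:function" (or "/path/to/file.py:function") via util.import_file() and calls that function with the metadata dictionary of each file, once per configured event ("file" by default). A minimal sketch of such a user module, assuming it is saved as hooks.py and referenced from the configuration as {"name": "python", "function": "/path/to/hooks.py:annotate"}; the module name, function name, and added field are illustrative only, not part of this diff:

# hooks.py - example callback for gallery-dl's "python" post processor
# The function receives pathfmt.kwdict for the current file and may modify
# it in place; added fields are visible to later formatters and processors.

def annotate(kwdict):
    # hypothetical field: uppercase copy of the filename, if present
    kwdict["filename_upper"] = str(kwdict.get("filename", "")).upper()

When an "archive" option is also configured, run_archive() above checks the file's archive entry before calling the function and records it afterwards, so the callback runs at most once per archived entry.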
-__version__ = "1.25.3-dev" +__version__ = "1.26.0-dev" diff --git a/gallery_dl/ytdl.py b/gallery_dl/ytdl.py index b4638b75..0a0bf864 100644 --- a/gallery_dl/ytdl.py +++ b/gallery_dl/ytdl.py @@ -399,7 +399,7 @@ def parse_command_line(module, argv): "playlist_items": opts.playlist_items, "xattr_set_filesize": opts.xattr_set_filesize, "match_filter": match_filter, - "no_color": opts.no_color, + "no_color": getattr(opts, "no_color", None), "ffmpeg_location": opts.ffmpeg_location, "hls_prefer_native": opts.hls_prefer_native, "hls_use_mpegts": opts.hls_use_mpegts, @@ -409,9 +409,12 @@ def parse_command_line(module, argv): "postprocessor_args": opts.postprocessor_args, "cn_verification_proxy": opts.cn_verification_proxy, "geo_verification_proxy": opts.geo_verification_proxy, - "geo_bypass": opts.geo_bypass, - "geo_bypass_country": opts.geo_bypass_country, - "geo_bypass_ip_block": opts.geo_bypass_ip_block, + "geo_bypass": getattr( + opts, "geo_bypass", "default"), + "geo_bypass_country": getattr( + opts, "geo_bypass_country", None), + "geo_bypass_ip_block": getattr( + opts, "geo_bypass_ip_block", None), "compat_opts": compat_opts, } diff --git a/scripts/pull-request b/scripts/pull-request new file mode 100755 index 00000000..defdc11f --- /dev/null +++ b/scripts/pull-request @@ -0,0 +1,55 @@ +#!/bin/bash +set -e + +RE="https://github.com/([^/?#]+)/([^/?#]+)(/tree/(.+))?" +if [[ "$1" =~ $RE ]]; then + USER="${BASH_REMATCH[1]}" + REPO="${BASH_REMATCH[2]}" + BRANCH="${BASH_REMATCH[4]:-master}" + +else + echo "invalid github repository identifier: '$1'" + exit 1 + +fi + + +call() { echo "$@"; "$@"; echo; } + +# {x,,} transforms value to lowercase +case "${2,,}" in + +""|"f"|"fetch") + call git remote add "$USER" git@github.com:"$USER"/"$REPO".git || true + call git fetch "$USER" "$BRANCH" + call git checkout -b "$USER-$BRANCH" "$USER/$BRANCH" + ;; + +"m"|"merge") + RE='\s*(.+)\s+#([0-9]+)' + if [[ "$3" =~ $RE ]]; then + TITLE="${BASH_REMATCH[1]}" + PULL="${BASH_REMATCH[2]}" + fi + + call git switch master + call git merge --no-ff --edit -m "merge #${PULL-_}: ${TITLE-_}" "$USER-$BRANCH" + call git branch -d "$USER-$BRANCH" + ;; + +"p"|"push") + call git push "$USER" HEAD:"$BRANCH" + ;; + +"d"|"delete") + call git switch master + call git branch -D "$USER-$BRANCH" + call git remote remove "$USER" + ;; + +*) + echo "invalid action: '$2'" + exit 2 + ;; + +esac diff --git a/scripts/release.sh b/scripts/release.sh index c675713d..f32c796d 100755 --- a/scripts/release.sh +++ b/scripts/release.sh @@ -59,7 +59,7 @@ build-linux() { rm -rf "${VENV_PATH}" python -m virtualenv "${VENV_PATH}" - $VENV_PYTHON -m pip install requests requests[socks] yt-dlp pyyaml toml secretstorage pyinstaller + $VENV_PYTHON -m pip install requests requests[socks] yt-dlp pyyaml secretstorage pyinstaller $VENV_PYTHON ./scripts/pyinstaller.py } diff --git a/scripts/supportedsites.py b/scripts/supportedsites.py index 5c64bca8..45a8266f 100755 --- a/scripts/supportedsites.py +++ b/scripts/supportedsites.py @@ -28,7 +28,6 @@ CATEGORY_MAP = { "b4k" : "arch.b4k.co", "baraag" : "baraag", "bbc" : "BBC", - "bcy" : "半次元", "comicvine" : "Comic Vine", "coomerparty" : "Coomer", "deviantart" : "DeviantArt", @@ -36,6 +35,7 @@ CATEGORY_MAP = { "dynastyscans" : "Dynasty Reader", "e621" : "e621", "e926" : "e926", + "e6ai" : "e6AI", "erome" : "EroMe", "e-hentai" : "E-Hentai", "exhentai" : "ExHentai", @@ -63,9 +63,10 @@ CATEGORY_MAP = { "imgth" : "imgth", "imgur" : "imgur", "joyreactor" : "JoyReactor", + "itchio" : "itch.io", + "jpgfish" : "JPG Fish", 
"kabeuchi" : "かべうち", "kemonoparty" : "Kemono", - "lineblog" : "LINE BLOG", "livedoor" : "livedoor Blog", "ohpolly" : "Oh Polly", "omgmiamiswimwear": "Omg Miami Swimwear", @@ -76,6 +77,7 @@ CATEGORY_MAP = { "mangalife" : "MangaLife", "manganelo" : "Manganato", "mangapark" : "MangaPark", + "mangaread" : "MangaRead", "mangasee" : "MangaSee", "mastodon.social": "mastodon.social", "mememuseum" : "meme.museum", @@ -116,10 +118,9 @@ CATEGORY_MAP = { "thatpervert" : "ThatPervert", "thebarchive" : "The /b/ Archive", "thecollection" : "The /co/llection", - "tokyochronos" : "TokyoChronos", "tumblrgallery" : "TumblrGallery", "vanillarock" : "もえぴりあ", - "vidyart" : "/v/idyart", + "vidyart2" : "/v/idyart2", "vk" : "VK", "vsco" : "VSCO", "wallpapercave" : "Wallpaper Cave", @@ -184,6 +185,9 @@ SUBCATEGORY_MAP = { "hentaifoundry": { "story": "", }, + "imgur": { + "favorite-folder": "Favorites Folders", + }, "instagram": { "posts": "", "saved": "Saved Posts", @@ -193,6 +197,9 @@ SUBCATEGORY_MAP = { "discord": "Discord Servers", "discord-server": "", }, + "lensdump": { + "albums": "", + }, "mangadex": { "feed" : "Followed Feed", }, @@ -211,6 +218,9 @@ SUBCATEGORY_MAP = { }, "pixiv": { "me" : "pixiv.me Links", + "novel-bookmark": "Novel Bookmarks", + "novel-series": "Novel Series", + "novel-user": "", "pixivision": "pixivision", "sketch": "Sketch", "work": "individual Images", @@ -230,6 +240,9 @@ SUBCATEGORY_MAP = { "smugmug": { "path": "Images from Users and Folders", }, + "tumblr": { + "day": "Days", + }, "twitter": { "media": "Media Timelines", "tweets": "", @@ -265,6 +278,7 @@ BASE_MAP = { "foolslide" : "FoOlSlide Instances", "gelbooru_v01": "Gelbooru Beta 0.1.11", "gelbooru_v02": "Gelbooru Beta 0.2", + "jschan" : "jschan Imageboards", "lolisafe" : "lolisafe and chibisafe", "lynxchan" : "LynxChan Imageboards", "moebooru" : "Moebooru and MyImouto", @@ -298,6 +312,7 @@ AUTH_MAP = { "fanbox" : _COOKIES, "fantia" : _COOKIES, "flickr" : _OAUTH, + "gfycat" : "Supported", "furaffinity" : _COOKIES, "horne" : "Required", "idolcomplex" : "Supported", @@ -326,8 +341,10 @@ AUTH_MAP = { "tsumino" : "Supported", "tumblr" : _OAUTH, "twitter" : "Supported", + "vipergirls" : "Supported", "wallhaven" : _APIKEY_WH, "weasyl" : _APIKEY_WY, + "zerochan" : "Supported", } IGNORE_LIST = ( diff --git a/setup.cfg b/setup.cfg index 521edc5d..56d71087 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,4 @@ ignore = E203,E226,W504 per-file-ignores = setup.py: E501 gallery_dl/extractor/500px.py: E501 + gallery_dl/extractor/mangapark.py: E501 diff --git a/test/test_cookies.py b/test/test_cookies.py index 335fa3dd..a6ad05f1 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2017-2022 Mike Fährmann +# Copyright 2017-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -46,8 +46,7 @@ class TestCookiejar(unittest.TestCase): def test_cookiefile(self): config.set((), "cookies", self.cookiefile) - - cookies = extractor.find("test:").session.cookies + cookies = _get_extractor("test").cookies self.assertEqual(len(cookies), 1) cookie = next(iter(cookies)) @@ -65,12 +64,14 @@ class TestCookiejar(unittest.TestCase): def _test_warning(self, filename, exc): config.set((), "cookies", filename) log = logging.getLogger("test") + with mock.patch.object(log, "warning") as mock_warning: - cookies = extractor.find("test:").session.cookies - 
self.assertEqual(len(cookies), 0) - self.assertEqual(mock_warning.call_count, 1) - self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") - self.assertIsInstance(mock_warning.call_args[0][1], exc) + cookies = _get_extractor("test").cookies + + self.assertEqual(len(cookies), 0) + self.assertEqual(mock_warning.call_count, 1) + self.assertEqual(mock_warning.call_args[0][0], "cookies: %s") + self.assertIsInstance(mock_warning.call_args[0][1], exc) class TestCookiedict(unittest.TestCase): @@ -83,7 +84,8 @@ class TestCookiedict(unittest.TestCase): config.clear() def test_dict(self): - cookies = extractor.find("test:").session.cookies + cookies = _get_extractor("test").cookies + self.assertEqual(len(cookies), len(self.cdict)) self.assertEqual(sorted(cookies.keys()), sorted(self.cdict.keys())) self.assertEqual(sorted(cookies.values()), sorted(self.cdict.values())) @@ -91,11 +93,11 @@ class TestCookiedict(unittest.TestCase): def test_domain(self): for category in ["exhentai", "idolcomplex", "nijie", "horne"]: extr = _get_extractor(category) - cookies = extr.session.cookies + cookies = extr.cookies for key in self.cdict: self.assertTrue(key in cookies) for c in cookies: - self.assertEqual(c.domain, extr.cookiedomain) + self.assertEqual(c.domain, extr.cookies_domain) class TestCookieLogin(unittest.TestCase): @@ -122,91 +124,96 @@ class TestCookieLogin(unittest.TestCase): class TestCookieUtils(unittest.TestCase): def test_check_cookies(self): - extr = extractor.find("test:") - self.assertFalse(extr._cookiejar, "empty") - self.assertFalse(extr.cookiedomain, "empty") + extr = _get_extractor("test") + self.assertFalse(extr.cookies, "empty") + self.assertFalse(extr.cookies_domain, "empty") # always returns False when checking for empty cookie list - self.assertFalse(extr._check_cookies(())) + self.assertFalse(extr.cookies_check(())) - self.assertFalse(extr._check_cookies(("a",))) - self.assertFalse(extr._check_cookies(("a", "b"))) - self.assertFalse(extr._check_cookies(("a", "b", "c"))) + self.assertFalse(extr.cookies_check(("a",))) + self.assertFalse(extr.cookies_check(("a", "b"))) + self.assertFalse(extr.cookies_check(("a", "b", "c"))) - extr._cookiejar.set("a", "1") - self.assertTrue(extr._check_cookies(("a",))) - self.assertFalse(extr._check_cookies(("a", "b"))) - self.assertFalse(extr._check_cookies(("a", "b", "c"))) + extr.cookies.set("a", "1") + self.assertTrue(extr.cookies_check(("a",))) + self.assertFalse(extr.cookies_check(("a", "b"))) + self.assertFalse(extr.cookies_check(("a", "b", "c"))) - extr._cookiejar.set("b", "2") - self.assertTrue(extr._check_cookies(("a",))) - self.assertTrue(extr._check_cookies(("a", "b"))) - self.assertFalse(extr._check_cookies(("a", "b", "c"))) + extr.cookies.set("b", "2") + self.assertTrue(extr.cookies_check(("a",))) + self.assertTrue(extr.cookies_check(("a", "b"))) + self.assertFalse(extr.cookies_check(("a", "b", "c"))) def test_check_cookies_domain(self): - extr = extractor.find("test:") - self.assertFalse(extr._cookiejar, "empty") - extr.cookiedomain = ".example.org" + extr = _get_extractor("test") + self.assertFalse(extr.cookies, "empty") + extr.cookies_domain = ".example.org" - self.assertFalse(extr._check_cookies(("a",))) - self.assertFalse(extr._check_cookies(("a", "b"))) + self.assertFalse(extr.cookies_check(("a",))) + self.assertFalse(extr.cookies_check(("a", "b"))) - extr._cookiejar.set("a", "1") - self.assertFalse(extr._check_cookies(("a",))) + extr.cookies.set("a", "1") + self.assertFalse(extr.cookies_check(("a",))) - extr._cookiejar.set("a", 
"1", domain=extr.cookiedomain) - self.assertTrue(extr._check_cookies(("a",))) + extr.cookies.set("a", "1", domain=extr.cookies_domain) + self.assertTrue(extr.cookies_check(("a",))) - extr._cookiejar.set("a", "1", domain="www" + extr.cookiedomain) - self.assertEqual(len(extr._cookiejar), 3) - self.assertTrue(extr._check_cookies(("a",))) + extr.cookies.set("a", "1", domain="www" + extr.cookies_domain) + self.assertEqual(len(extr.cookies), 3) + self.assertTrue(extr.cookies_check(("a",))) - extr._cookiejar.set("b", "2", domain=extr.cookiedomain) - extr._cookiejar.set("c", "3", domain=extr.cookiedomain) - self.assertTrue(extr._check_cookies(("a", "b", "c"))) + extr.cookies.set("b", "2", domain=extr.cookies_domain) + extr.cookies.set("c", "3", domain=extr.cookies_domain) + self.assertTrue(extr.cookies_check(("a", "b", "c"))) def test_check_cookies_expires(self): - extr = extractor.find("test:") - self.assertFalse(extr._cookiejar, "empty") - self.assertFalse(extr.cookiedomain, "empty") + extr = _get_extractor("test") + self.assertFalse(extr.cookies, "empty") + self.assertFalse(extr.cookies_domain, "empty") now = int(time.time()) log = logging.getLogger("test") - extr._cookiejar.set("a", "1", expires=now-100) + extr.cookies.set("a", "1", expires=now-100) with mock.patch.object(log, "warning") as mw: - self.assertFalse(extr._check_cookies(("a",))) + self.assertFalse(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ("Cookie '%s' has expired", "a")) - extr._cookiejar.set("a", "1", expires=now+100) + extr.cookies.set("a", "1", expires=now+100) with mock.patch.object(log, "warning") as mw: - self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ( "Cookie '%s' will expire in less than %s hour%s", "a", 1, "")) - extr._cookiejar.set("a", "1", expires=now+100+7200) + extr.cookies.set("a", "1", expires=now+100+7200) with mock.patch.object(log, "warning") as mw: - self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 1) self.assertEqual(mw.call_args[0], ( "Cookie '%s' will expire in less than %s hour%s", "a", 3, "s")) - extr._cookiejar.set("a", "1", expires=now+100+24*3600) + extr.cookies.set("a", "1", expires=now+100+24*3600) with mock.patch.object(log, "warning") as mw: - self.assertTrue(extr._check_cookies(("a",))) + self.assertTrue(extr.cookies_check(("a",))) self.assertEqual(mw.call_count, 0) def _get_extractor(category): - URLS = { - "exhentai" : "https://exhentai.org/g/1200119/d55c44d3d0/", - "idolcomplex": "https://idol.sankakucomplex.com/post/show/1", - "nijie" : "https://nijie.info/view.php?id=1", - "horne" : "https://horne.red/view.php?id=1", - } - return extractor.find(URLS[category]) + extr = extractor.find(URLS[category]) + extr.initialize() + return extr + + +URLS = { + "exhentai" : "https://exhentai.org/g/1200119/d55c44d3d0/", + "idolcomplex": "https://idol.sankakucomplex.com/post/show/1", + "nijie" : "https://nijie.info/view.php?id=1", + "horne" : "https://horne.red/view.php?id=1", + "test" : "test:", +} if __name__ == "__main__": diff --git a/test/test_downloader.py b/test/test_downloader.py index c65be952..840e0780 100644 --- a/test/test_downloader.py +++ b/test/test_downloader.py @@ -34,6 +34,7 @@ class FakeJob(): def __init__(self): self.extractor = extractor.find("test:") + self.extractor.initialize() self.pathfmt = path.PathFormat(self.extractor) self.out = 
output.NullOutput() self.get_logger = logging.getLogger diff --git a/test/test_extractor.py b/test/test_extractor.py index 6516fa8f..f8bed133 100644 --- a/test/test_extractor.py +++ b/test/test_extractor.py @@ -132,8 +132,30 @@ class TestExtractorModule(unittest.TestCase): else: self.assertIs(extr1, matches[0][1], url) + def test_init(self): + """Test for exceptions in Extractor.initialize(()""" + for cls in extractor.extractors(): + if cls.category == "ytdl": + continue + for test in cls._get_tests(): + extr = cls.from_url(test[0]) + extr.initialize() + extr.finalize() + break + + def test_init_ytdl(self): + try: + extr = extractor.find("ytdl:") + extr.initialize() + extr.finalize() + except ImportError as exc: + if exc.name in ("youtube_dl", "yt_dlp"): + raise unittest.SkipTest("cannot import module '{}'".format( + exc.name)) + raise + def test_docstrings(self): - """ensure docstring uniqueness""" + """Ensure docstring uniqueness""" for extr1 in extractor.extractors(): for extr2 in extractor.extractors(): if extr1 != extr2 and extr1.__doc__ and extr2.__doc__: diff --git a/test/test_formatter.py b/test/test_formatter.py index 22589668..0992f4ba 100644 --- a/test/test_formatter.py +++ b/test/test_formatter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -# Copyright 2021-2022 Mike Fährmann +# Copyright 2021-2023 Mike Fährmann # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 as @@ -23,10 +23,12 @@ class TestFormatter(unittest.TestCase): kwdict = { "a": "hElLo wOrLd", "b": "äöü", + "j": "げんそうきょう", "d": {"a": "foo", "b": 0, "c": None}, "l": ["a", "b", "c"], "n": None, "s": " \n\r\tSPACE ", + "h": "
<p>foo </p> &amp; bar <p> </p>
    ", "u": "'< / >'", "t": 1262304000, "dt": datetime.datetime(2010, 1, 1), @@ -46,6 +48,10 @@ class TestFormatter(unittest.TestCase): self._run_test("{s!t}", "SPACE") self._run_test("{a!U}", self.kwdict["a"]) self._run_test("{u!U}", "'< / >'") + self._run_test("{a!H}", self.kwdict["a"]) + self._run_test("{h!H}", "foo & bar") + self._run_test("{u!H}", "'< / >'") + self._run_test("{n!H}", "") self._run_test("{a!s}", self.kwdict["a"]) self._run_test("{a!r}", "'" + self.kwdict["a"] + "'") self._run_test("{a!a}", "'" + self.kwdict["a"] + "'") @@ -133,7 +139,7 @@ class TestFormatter(unittest.TestCase): self._run_test("{d['a']}", "foo") self._run_test('{d["a"]}', "foo") - def test_slicing(self): + def test_slice_str(self): v = self.kwdict["a"] self._run_test("{a[1:10]}" , v[1:10]) self._run_test("{a[-10:-1]}", v[-10:-1]) @@ -165,6 +171,26 @@ class TestFormatter(unittest.TestCase): self._run_test("{a:[:50:2]}", v[:50:2]) self._run_test("{a:[::]}" , v) + def test_slice_bytes(self): + v = self.kwdict["j"] + self._run_test("{j[b1:10]}" , v[1:3]) + self._run_test("{j[b-10:-1]}", v[-3:-1]) + self._run_test("{j[b5:]}" , v[2:]) + self._run_test("{j[b50:]}" , v[50:]) + self._run_test("{j[b:5]}" , v[:1]) + self._run_test("{j[b:50]}" , v[:50]) + self._run_test("{j[b:]}" , v) + self._run_test("{j[b::]}" , v) + + self._run_test("{j:[b1:10]}" , v[1:3]) + self._run_test("{j:[b-10:-1]}", v[-3:-1]) + self._run_test("{j:[b5:]}" , v[2:]) + self._run_test("{j:[b50:]}" , v[50:]) + self._run_test("{j:[b:5]}" , v[:1]) + self._run_test("{j:[b:50]}" , v[:50]) + self._run_test("{j:[b:]}" , v) + self._run_test("{j:[b::]}" , v) + def test_maxlen(self): v = self.kwdict["a"] self._run_test("{a:L5/foo/}" , "foo") @@ -413,10 +439,10 @@ def noarg(): fmt4 = formatter.parse("\fM " + path + ":lengths") self.assertEqual(fmt1.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt2.format_map(self.kwdict), "89") + self.assertEqual(fmt2.format_map(self.kwdict), "126") self.assertEqual(fmt3.format_map(self.kwdict), "'Title' by Name") - self.assertEqual(fmt4.format_map(self.kwdict), "89") + self.assertEqual(fmt4.format_map(self.kwdict), "126") with self.assertRaises(TypeError): self.assertEqual(fmt0.format_map(self.kwdict), "") diff --git a/test/test_postprocessor.py b/test/test_postprocessor.py index c78d7b03..bcabdc8c 100644 --- a/test/test_postprocessor.py +++ b/test/test_postprocessor.py @@ -164,6 +164,76 @@ class ClassifyTest(BasePostprocessorTest): mkdirs.assert_called_once_with(path, exist_ok=True) +class ExecTest(BasePostprocessorTest): + + def test_command_string(self): + self._create({ + "command": "echo {} && rm {};", + }) + + with patch("subprocess.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + p.assert_called_once_with( + "echo {0} && rm {0};".format(self.pathfmt.realpath), shell=True) + i.wait.assert_called_once_with() + + def test_command_list(self): + self._create({ + "command": ["~/script.sh", "{category}", + "\fE _directory.upper()"], + }) + + with patch("subprocess.Popen") as p: + i = Mock() + i.wait.return_value = 0 + p.return_value = i + self._trigger(("after",)) + + p.assert_called_once_with( + [ + os.path.expanduser("~/script.sh"), + self.pathfmt.kwdict["category"], + self.pathfmt.realdirectory.upper(), + ], + shell=False, + ) + + def test_command_returncode(self): + self._create({ + "command": "echo {}", + }) + + with patch("subprocess.Popen") as p: + i = Mock() + i.wait.return_value = 123 + p.return_value = i + + with self.assertLogs() as 
log: + self._trigger(("after",)) + + msg = ("WARNING:postprocessor.exec:'echo {}' returned with " + "non-zero exit status (123)".format(self.pathfmt.realpath)) + self.assertEqual(log.output[0], msg) + + def test_async(self): + self._create({ + "async" : True, + "command": "echo {}", + }) + + with patch("subprocess.Popen") as p: + i = Mock() + p.return_value = i + self._trigger(("after",)) + + self.assertTrue(p.called) + self.assertFalse(i.wait.called) + + class MetadataTest(BasePostprocessorTest): def test_metadata_default(self): @@ -388,51 +458,60 @@ class MetadataTest(BasePostprocessorTest): """) def test_metadata_modify(self): - kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}} + kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3, "ba2": {}}} self._create({ "mode": "modify", "fields": { - "foo" : "{filename}-{foo!s}", - "foo2" : "\fE bar['bax'] + 122", - "bar[baz]": "{_now}", - "bar[ba2]": "test", + "foo" : "{filename}-{foo!s}", + "foo2" : "\fE bar['bax'] + 122", + "bar[\"baz\"]" : "{_now}", + "bar['ba2'][a]": "test", }, }, kwdict) - pdict = self.pathfmt.kwdict + pdict = self.pathfmt.kwdict self.assertIsNot(kwdict, pdict) self.assertEqual(pdict["foo"], kwdict["foo"]) self.assertEqual(pdict["bar"], kwdict["bar"]) self._trigger() - self.assertEqual(pdict["foo"] , "file-0") - self.assertEqual(pdict["foo2"] , 123) - self.assertEqual(pdict["bar"]["ba2"], "test") + self.assertEqual(pdict["foo"] , "file-0") + self.assertEqual(pdict["foo2"], 123) + self.assertEqual(pdict["bar"]["ba2"]["a"], "test") self.assertIsInstance(pdict["bar"]["baz"], datetime) def test_metadata_delete(self): - kwdict = {"foo": 0, "bar": {"bax": 1, "bay": 2, "baz": 3}} - self._create({"mode": "delete", "fields": ["foo", "bar[baz]"]}, kwdict) - pdict = self.pathfmt.kwdict + kwdict = { + "foo": 0, + "bar": { + "bax": 1, + "bay": 2, + "baz": {"a": 3, "b": 4}, + }, + } + self._create({ + "mode": "delete", + "fields": ["foo", "bar['bax']", "bar[\"baz\"][a]"], + }, kwdict) + pdict = self.pathfmt.kwdict self.assertIsNot(kwdict, pdict) + self.assertEqual(pdict["foo"], kwdict["foo"]) self.assertEqual(pdict["bar"], kwdict["bar"]) - del kwdict["foo"] - del kwdict["bar"]["baz"] - self._trigger() + self.assertNotIn("foo", pdict) - self.assertNotIn("baz", pdict["bar"]) - self.assertEqual(kwdict["bar"], pdict["bar"]) + self.assertNotIn("bax", pdict["bar"]) + self.assertNotIn("a", pdict["bar"]["baz"]) # no errors for deleted/undefined fields self._trigger() self.assertNotIn("foo", pdict) - self.assertNotIn("baz", pdict["bar"]) - self.assertEqual(kwdict["bar"], pdict["bar"]) + self.assertNotIn("bax", pdict["bar"]) + self.assertNotIn("a", pdict["bar"]["baz"]) def test_metadata_option_skip(self): self._create({"skip": True}) @@ -500,6 +579,40 @@ class MtimeTest(BasePostprocessorTest): self.assertEqual(self.pathfmt.kwdict["_mtime"], 315532800) +class PythonTest(BasePostprocessorTest): + + def test_module(self): + path = os.path.join(self.dir.name, "module.py") + self._write_module(path) + + sys.path.insert(0, self.dir.name) + try: + self._create({"function": "module:calc"}, {"_value": 123}) + finally: + del sys.path[0] + + self.assertNotIn("_result", self.pathfmt.kwdict) + self._trigger() + self.assertEqual(self.pathfmt.kwdict["_result"], 246) + + def test_path(self): + path = os.path.join(self.dir.name, "module.py") + self._write_module(path) + + self._create({"function": path + ":calc"}, {"_value": 12}) + + self.assertNotIn("_result", self.pathfmt.kwdict) + self._trigger() + self.assertEqual(self.pathfmt.kwdict["_result"], 
24) + + def _write_module(self, path): + with open(path, "w") as fp: + fp.write(""" +def calc(kwdict): + kwdict["_result"] = kwdict["_value"] * 2 +""") + + class ZipTest(BasePostprocessorTest): def test_zip_default(self): diff --git a/test/test_results.py b/test/test_results.py index d28496b3..f434b2fc 100644 --- a/test/test_results.py +++ b/test/test_results.py @@ -322,9 +322,11 @@ def setup_test_config(): config.set(("extractor", "mangoxo") , "username", "LiQiang3") config.set(("extractor", "mangoxo") , "password", "5zbQF10_5u25259Ma") - for category in ("danbooru", "atfbooru", "aibooru", "e621", "e926", + for category in ("danbooru", "atfbooru", "aibooru", "booruvar", + "e621", "e926", "e6ai", "instagram", "twitter", "subscribestar", "deviantart", - "inkbunny", "tapas", "pillowfort", "mangadex"): + "inkbunny", "tapas", "pillowfort", "mangadex", + "vipergirls", "gfycat"): config.set(("extractor", category), "username", None) config.set(("extractor", "mastodon.social"), "access-token", diff --git a/test/test_ytdl.py b/test/test_ytdl.py index 7b82a0f8..878ac85b 100644 --- a/test/test_ytdl.py +++ b/test/test_ytdl.py @@ -269,6 +269,31 @@ class Test_CommandlineArguments_YtDlp(Test_CommandlineArguments): "title:%(artist)s - %(title)s")], }) + def test_geo_bypass(self): + try: + ytdl.parse_command_line(self.module, ["--xff", "default"]) + except Exception: + # before --xff (c16644642) + return Test_CommandlineArguments.test_geo_bypass(self) + + self._(["--xff", "default"], + "geo_bypass", "default") + self._(["--xff", "never"], + "geo_bypass", "never") + self._(["--xff", "EN"], + "geo_bypass", "EN") + self._(["--xff", "198.51.100.14/24"], + "geo_bypass", "198.51.100.14/24") + + self._("--geo-bypass", + "geo_bypass", "default") + self._("--no-geo-bypass", + "geo_bypass", "never") + self._(["--geo-bypass-country", "EN"], + "geo_bypass", "EN") + self._(["--geo-bypass-ip-block", "198.51.100.14/24"], + "geo_bypass", "198.51.100.14/24") + if __name__ == "__main__": unittest.main(warnings="ignore")
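As a closing note, the formatter additions exercised by test_slice_bytes and the !H conversion tests above can be combined in ordinary format strings. A small stand-alone sketch, assuming a UTF-8 filesystem encoding (byte slicing encodes with sys.getfilesystemencoding() and decodes with errors="ignore"); the sample strings below are made up for illustration:

from gallery_dl import formatter

kwdict = {
    "title": "<p>Hello <b>world</b> &amp; friends</p>",  # sample HTML fragment
    "name" : "げんそうきょう",                            # sample multi-byte string
}

# !H strips HTML tags and unescapes entities ("H" conversion added above)
print(formatter.parse("{title!H}").format_map(kwdict))
# -> Hello world & friends

# a leading "b" inside the slice selects byte-based slicing; partial
# characters at the cut are dropped because decoding ignores errors
print(formatter.parse("{name[b:6]}").format_map(kwdict))
# -> げん   (each character occupies 3 bytes in UTF-8)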