mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 02:32:33 +01:00

Merge branch 'mikf:master' into feature/patreonPostComments

Krystian Owoc 2024-05-17 17:11:13 +02:00 committed by GitHub
commit e6c948f425
121 changed files with 2649 additions and 777 deletions

@@ -1,33 +1,47 @@
-name: docker
+name: Docker Images

 on:
   workflow_dispatch:
   push:
+    branches:
+    - master
     tags:
     - v[0-9]+.[0-9]+.[0-9]+

 permissions:
   packages: write

+concurrency:
+  group: docker
+  cancel-in-progress: false
+
 jobs:
-  docker:
+  build:
     runs-on: ubuntu-latest

+    # on release commits, run only for tag event
+    if: ${{ ! startsWith( github.event.head_commit.message , 'release version ' ) || startsWith( github.ref , 'refs/tags/v' ) }}
+
     steps:
     - uses: actions/checkout@v4

-    # https://github.com/docker/setup-buildx-action
-    - name: Set up Docker Buildx
-      uses: docker/setup-buildx-action@v3
-
-    # https://github.com/docker/login-action
-    - name: Login to GitHub Container Registry
-      uses: docker/login-action@v3
+    - uses: docker/metadata-action@v5
+      id: metadata
       with:
-        registry: ghcr.io
-        username: ${{ github.repository_owner }}
-        password: ${{ secrets.GHCR_TOKEN }}
+        images: |
+          mikf123/gallery-dl
+          ghcr.io/mikf/gallery-dl
+        tags: |
+          type=ref,event=tag
+          type=raw,value=dev
+          type=sha,format=long,prefix=
+          type=raw,priority=500,value={{date 'YYYY.MM.DD'}}
+
+    - uses: docker/setup-qemu-action@v3
+    - uses: docker/setup-buildx-action@v3

     - name: Login to DockerHub
       uses: docker/login-action@v3
@@ -35,23 +49,17 @@ jobs:
         username: ${{ secrets.DOCKERHUB_USERNAME }}
         password: ${{ secrets.DOCKERHUB_TOKEN }}

-    # https://github.com/docker/metadata-action
-    - name: Generate Docker tags
-      uses: docker/metadata-action@v5
-      id: metadata
+    - name: Login to GitHub Container Registry
+      uses: docker/login-action@v3
       with:
-        images: |
-          mikf123/gallery-dl
-          ghcr.io/mikf/gallery-dl
-        tags: |
-          type=sha,format=long,prefix=
-          type=ref,event=tag
+        registry: ghcr.io
+        username: ${{ github.repository_owner }}
+        password: ${{ secrets.GHCR_TOKEN }}

-    # https://github.com/docker/build-push-action
-    - name: Build image
-      uses: docker/build-push-action@v5
+    - uses: docker/build-push-action@v5
       with:
+        context: .
         push: true
         tags: ${{ steps.metadata.outputs.tags }}
         labels: ${{ steps.metadata.outputs.labels }}
-        platforms: linux/amd64
+        platforms: linux/amd64,linux/arm64

@@ -1,10 +1,15 @@
-name: executables
+name: Executables

 on:
   workflow_dispatch:
   push:
     branches:
     - master
+    tags-ignore:
+    - "*"
+
+env:
+  DATE_FORMAT: "%Y.%m.%d"

 jobs:
   build:
@@ -31,19 +36,58 @@ jobs:
     - uses: actions/checkout@v4

     - name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
       with:
         python-version: ${{ matrix.python-version }}
         architecture: ${{ matrix.architecture }}

+    - name: Date
+      run: echo "DATE=$(date '+${{ env.DATE_FORMAT }}')" >> "$GITHUB_ENV"
+
+    - name: Update Version
+      # use Python since its behavior is consistent across operating systems
+      shell: python
+      run: |
+        import re
+        path = "./gallery_dl/version.py"
+        with open(path) as fp:
+            content = fp.read()
+        content = re.sub(
+            r'\b(__version__ = "[^"]+)',
+            r"\1:${{ env.DATE }}",
+            content)
+        with open(path, "w") as fp:
+            fp.write(content)
+
     - name: Build executable
       run: |
         pip install requests requests[socks] yt-dlp pyyaml ${{ matrix.python-packages }} pyinstaller
-        python scripts/pyinstaller.py
+        python ./scripts/pyinstaller.py --os '${{ matrix.os }}' --arch '${{ matrix.architecture }}'

-    - name: Upload executable
-      uses: actions/upload-artifact@v3
+    - uses: actions/upload-artifact@v4
       with:
-        name: gallery-dl-${{ matrix.os }}-${{ matrix.architecture }}-${{ matrix.python-version }}
-        path: |
-          dist
+        name: executable-${{ matrix.os }}-${{ matrix.architecture }}-${{ matrix.python-version }}
+        path: dist/*
+        retention-days: 1
+        compression-level: 0
+
+  release:
+    needs: build
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/download-artifact@v4
+
+    - name: Date
+      run: echo "DATE=$(date '+${{ env.DATE_FORMAT }}')" >> "$GITHUB_ENV"
+
+    - uses: ncipollo/release-action@v1
+      with:
+        owner: gdl-org
+        repo: builds
+        tag: ${{ env.DATE }}
+        artifacts: "executable-*/*"
+        allowUpdates: true
+        makeLatest: true
+        token: ${{ secrets.REPO_TOKEN }}

.github/workflows/pages.yml (new file)

@@ -0,0 +1,56 @@
name: GitHub Pages

on:
  workflow_dispatch:
  push:
    branches:
    - master
    paths:
    - docs/**

permissions:
  contents: read
  pages: write
  id-token: write

concurrency:
  group: pages
  cancel-in-progress: false

jobs:
  dispatch:
    runs-on: ubuntu-latest

    steps:
    - name: Dispatch to gdl-org/docs
      run: >
        curl -L
        -X POST
        -H "Accept: application/vnd.github+json"
        -H "Authorization: Bearer ${{ secrets.REPO_TOKEN }}"
        -H "X-GitHub-Api-Version: 2022-11-28"
        https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches
        -d '{"ref":"master"}'

  deploy:
    runs-on: ubuntu-latest

    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}

    steps:
    - uses: actions/checkout@v4
    - uses: actions/configure-pages@v4

    - name: Copy static files
      run: |
        mkdir --parents -- ./_site
        cp --archive --target-directory=./_site -- \
          ./docs/oauth-redirect.html

    - uses: actions/upload-pages-artifact@v3
    - uses: actions/deploy-pages@v4
      id: deployment

@@ -1,23 +0,0 @@
-name: Dispatch GitHub Pages Build
-
-on:
-  workflow_dispatch:
-  push:
-    branches:
-    - "master"
-    paths:
-    - "docs/**"
-
-jobs:
-  dispatch:
-    runs-on: ubuntu-latest
-    steps:
-    - name: dispatch
-      run: >
-        curl -L
-        -X POST
-        -H "Accept: application/vnd.github+json"
-        -H "Authorization: Bearer ${{ secrets.DISPATCH_TOKEN }}"
-        -H "X-GitHub-Api-Version: 2022-11-28"
-        https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches
-        -d '{"ref":"master"}'

@@ -16,7 +16,7 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]
+        python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"]

    steps:
    - uses: actions/checkout@v4
@@ -26,7 +26,7 @@ jobs:
        if [[ "$(find ./gallery_dl -type f -not -perm 644)" ]]; then exit 1; fi

    - name: Set up Python ${{ matrix.python-version }}
-      uses: actions/setup-python@v4
+      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}

@@ -1,5 +1,89 @@
# Changelog
## 1.26.9 - 2024-03-23
### Extractors
#### Additions
- [artstation] support video clips ([#2566](https://github.com/mikf/gallery-dl/issues/2566), [#3309](https://github.com/mikf/gallery-dl/issues/3309), [#3911](https://github.com/mikf/gallery-dl/issues/3911))
- [artstation] support collections ([#146](https://github.com/mikf/gallery-dl/issues/146))
- [deviantart] recognize `deviantart.com/stash/…` URLs
- [idolcomplex] support new pool URLs
- [lensdump] recognize direct image links ([#5293](https://github.com/mikf/gallery-dl/issues/5293))
- [skeb] add extractor for followed users ([#5290](https://github.com/mikf/gallery-dl/issues/5290))
- [twitter] add `quotes` extractor ([#5262](https://github.com/mikf/gallery-dl/issues/5262))
- [wikimedia] support `azurlane.koumakan.jp` ([#5256](https://github.com/mikf/gallery-dl/issues/5256))
- [xvideos] support `/channels/` URLs ([#5244](https://github.com/mikf/gallery-dl/issues/5244))
#### Fixes
- [artstation] fix handling usernames with dashes in domain names ([#5224](https://github.com/mikf/gallery-dl/issues/5224))
- [bluesky] fix not spawning child extractors for followed users ([#5246](https://github.com/mikf/gallery-dl/issues/5246))
- [deviantart] handle CloudFront blocks ([#5363](https://github.com/mikf/gallery-dl/issues/5363))
- [deviantart:avatar] fix `index` for URLs without `?` ([#5276](https://github.com/mikf/gallery-dl/issues/5276))
- [deviantart:stash] fix `index` values ([#5335](https://github.com/mikf/gallery-dl/issues/5335))
- [gofile] fix extraction
- [hiperdex] update URL patterns & fix `manga` metadata ([#5340](https://github.com/mikf/gallery-dl/issues/5340))
- [idolcomplex] fix metadata extraction
- [imagefap] fix folder extraction ([#5333](https://github.com/mikf/gallery-dl/issues/5333))
- [instagram] make accessing `like_count` non-fatal ([#5218](https://github.com/mikf/gallery-dl/issues/5218))
- [mastodon] fix handling null `moved` account field ([#5321](https://github.com/mikf/gallery-dl/issues/5321))
- [naver] fix EUC-KR encoding issue in old image URLs ([#5126](https://github.com/mikf/gallery-dl/issues/5126))
- [nijie] increase default delay between requests ([#5221](https://github.com/mikf/gallery-dl/issues/5221))
- [nitter] ignore invalid Tweets ([#5253](https://github.com/mikf/gallery-dl/issues/5253))
- [pixiv:novel] fix text extraction ([#5285](https://github.com/mikf/gallery-dl/issues/5285), [#5309](https://github.com/mikf/gallery-dl/issues/5309))
- [skeb] retry 429 responses containing a `request_key` cookie ([#5210](https://github.com/mikf/gallery-dl/issues/5210))
- [warosu] fix crash for threads with deleted posts ([#5289](https://github.com/mikf/gallery-dl/issues/5289))
- [weibo] fix retweets ([#2825](https://github.com/mikf/gallery-dl/issues/2825), [#3874](https://github.com/mikf/gallery-dl/issues/3874), [#5263](https://github.com/mikf/gallery-dl/issues/5263))
- [weibo] fix `livephoto` filename extensions ([#5287](https://github.com/mikf/gallery-dl/issues/5287))
- [xvideos] fix galleries with more than 500 images ([#5244](https://github.com/mikf/gallery-dl/issues/5244))
#### Improvements
- [bluesky] improve API error messages
- [bluesky] handle posts with different `embed` structure
- [deviantart:avatar] ignore default avatars ([#5276](https://github.com/mikf/gallery-dl/issues/5276))
- [fapello] download full-sized images ([#5349](https://github.com/mikf/gallery-dl/issues/5349))
- [gelbooru:favorite] automatically detect returned post order ([#5220](https://github.com/mikf/gallery-dl/issues/5220))
- [imgur] fail downloads when redirected to `removed.png` ([#5308](https://github.com/mikf/gallery-dl/issues/5308))
- [instagram] raise proper error for missing `reels_media` ([#5257](https://github.com/mikf/gallery-dl/issues/5257))
- [instagram] change `posts are private` exception to a warning ([#5322](https://github.com/mikf/gallery-dl/issues/5322))
- [reddit] improve preview fallback formats ([#5296](https://github.com/mikf/gallery-dl/issues/5296), [#5315](https://github.com/mikf/gallery-dl/issues/5315))
- [steamgriddb] raise exception for deleted assets
- [twitter] handle "account is temporarily locked" errors ([#5300](https://github.com/mikf/gallery-dl/issues/5300))
- [weibo] rework pagination logic ([#4168](https://github.com/mikf/gallery-dl/issues/4168))
- [zerochan] fetch more posts by using the API ([#3669](https://github.com/mikf/gallery-dl/issues/3669))
#### Metadata
- [bluesky] add `instance` metadata field ([#4438](https://github.com/mikf/gallery-dl/issues/4438))
- [gelbooru:favorite] add `date_favorited` metadata field
- [imagefap] extract `folder` metadata ([#5270](https://github.com/mikf/gallery-dl/issues/5270))
- [instagram] default `likes` to `0` ([#5323](https://github.com/mikf/gallery-dl/issues/5323))
- [kemonoparty] add `revision_count` metadata field ([#5334](https://github.com/mikf/gallery-dl/issues/5334))
- [naver] unescape post `title` and `description`
- [pornhub:gif] extract `viewkey` and `timestamp` metadata ([#4463](https://github.com/mikf/gallery-dl/issues/4463))
- [redgifs] make `date` available for directories ([#5262](https://github.com/mikf/gallery-dl/issues/5262))
- [subscribestar] fix `date` metadata
- [twitter] add `birdwatch` metadata field ([#5317](https://github.com/mikf/gallery-dl/issues/5317))
- [twitter] add `protected` metadata field ([#5327](https://github.com/mikf/gallery-dl/issues/5327))
- [warosu] fix `board_name` metadata
#### Options
- [bluesky] add `reposts` option ([#4438](https://github.com/mikf/gallery-dl/issues/4438), [#5248](https://github.com/mikf/gallery-dl/issues/5248))
- [deviantart] add `comments-avatars` option ([#4995](https://github.com/mikf/gallery-dl/issues/4995))
- [deviantart] extend `metadata` option ([#5175](https://github.com/mikf/gallery-dl/issues/5175))
- [flickr] add `contexts` option ([#5324](https://github.com/mikf/gallery-dl/issues/5324))
- [gelbooru:favorite] add `order-posts` option ([#5220](https://github.com/mikf/gallery-dl/issues/5220))
- [kemonoparty] add `order-revisions` option ([#5334](https://github.com/mikf/gallery-dl/issues/5334))
- [vipergirls] add `like` option ([#4166](https://github.com/mikf/gallery-dl/issues/4166))
- [vipergirls] add `domain` option ([#4166](https://github.com/mikf/gallery-dl/issues/4166))
### Downloaders
- [http] add MIME type and signature for `.mov` files ([#5287](https://github.com/mikf/gallery-dl/issues/5287))
### Docker
- build images from source instead of PyPI package
- build `linux/arm64` images ([#5227](https://github.com/mikf/gallery-dl/issues/5227))
- build images on every push to master
- tag images as `YYYY.MM.DD`
- tag the most recent build from master as `dev`
- tag the most recent release build as `latest`
- reduce image size ([#5097](https://github.com/mikf/gallery-dl/issues/5097))
### Miscellaneous
- [formatter] fix local DST datetime offsets for `:O`
- build Linux executable on Ubuntu 22.04 LTS ([#4184](https://github.com/mikf/gallery-dl/issues/4184))
- automatically create directories for logging files ([#5249](https://github.com/mikf/gallery-dl/issues/5249))
## 1.26.8 - 2024-02-17
### Extractors
#### Additions

@@ -1,7 +1,21 @@
 FROM python:alpine
-RUN python3 -m pip install --no-cache-dir -U pip && \
-    python3 -m pip install --no-cache-dir -U gallery-dl yt-dlp
-RUN apk update && \
-    apk add --no-cache ffmpeg && \
-    rm -rf /var/cache/apk/*
+ENV LANG=C.UTF-8
+
+RUN : \
+    && apk --no-interactive update \
+    && apk --no-cache --no-interactive add ffmpeg \
+    && rm -rf /var/cache/apk \
+    && :
+
+RUN : \
+    && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \
+        pip \
+    && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \
+        https://github.com/mikf/gallery-dl/archive/refs/heads/master.tar.gz \
+        yt-dlp \
+    && rm -rf /root/.cache/pip \
+    && find /usr/local/lib/python3.*/site-packages/setuptools -name __pycache__ -exec rm -rf {} + \
+    && find /usr/local/lib/python3.*/site-packages/wheel -name __pycache__ -exec rm -rf {} + \
+    && :
+
 ENTRYPOINT [ "gallery-dl" ]

@@ -7,8 +7,8 @@ to download image galleries and collections
from several image hosting sites
(see `Supported Sites <docs/supportedsites.md>`__).
It is a cross-platform tool
-with many `configuration options <docs/configuration.rst>`__
-and powerful `filenaming capabilities <docs/formatting.md>`__.
+with many `configuration options <https://gdl-org.github.io/docs/configuration.html>`__
+and powerful `filenaming capabilities <https://gdl-org.github.io/docs/formatting.html>`__.

|pypi| |build|
@@ -72,9 +72,9 @@ Standalone Executable
Prebuilt executable files with a Python interpreter and
required Python packages included are available for

-- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.exe>`__
+- `Windows <https://github.com/mikf/gallery-dl/releases/download/v1.26.9/gallery-dl.exe>`__
  (Requires `Microsoft Visual C++ Redistributable Package (x86) <https://aka.ms/vs/17/release/vc_redist.x86.exe>`__)
-- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.8/gallery-dl.bin>`__
+- `Linux <https://github.com/mikf/gallery-dl/releases/download/v1.26.9/gallery-dl.bin>`__

Nightly Builds
@@ -234,7 +234,7 @@ Documentation
-------------

A list of all available configuration options and their descriptions
-can be found in `<docs/configuration.rst>`__.
+can be found at `<https://gdl-org.github.io/docs/configuration.html>`__.

| For a default configuration file with available options set to their
| default values, see `<docs/gallery-dl.conf>`__.
@@ -330,7 +330,7 @@ CAPTCHA or similar, or has not been implemented yet, you can use the
cookies from a browser login session and input them into *gallery-dl*.

This can be done via the
-`cookies <docs/configuration.rst#extractorcookies>`__
+`cookies <https://gdl-org.github.io/docs/configuration.html#extractor-cookies>`__
option in your configuration file by specifying

- | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon

@@ -0,0 +1,20 @@
<!DOCTYPE html>
<html lang="en-US">
<head>
<meta charset="UTF-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
{% seo %}
<link rel="stylesheet" href="{{ "/assets/css/style.css?v=" | append: site.github.build_revision | relative_url }}">
<script src="links.js"></script>
</head>
<body>
<div class="container-lg px-3 my-5 markdown-body">
{{ content }}
</div>
</body>
</html>

@@ -337,6 +337,15 @@ Description
    filename extension (``file.1.ext``, ``file.2.ext``, etc.)
extractor.*.skip-filter
-----------------------
Type
``string``
Description
Python expression controlling which skipped files to count towards
``"abort"`` / ``"terminate"`` / ``"exit"``.
extractor.*.sleep
-----------------
Type
@@ -358,12 +367,39 @@ Description
    i.e. before starting a new extractor.
extractor.*.sleep-429
---------------------
Type
|Duration|_
Default
``60``
Description
Number of seconds to sleep when receiving a `429 Too Many Requests`
response before `retrying <extractor.*.retries_>`__ the request.
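
For illustration only (site and values assumed, not taken from the diff), the back-off for ``429`` responses could be raised for a single site like this:

.. code:: json

    {
        "extractor": {
            "twitter": {
                "sleep-429": "300-600"
            }
        }
    }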
extractor.*.sleep-request
-------------------------
Type
    |Duration|_
Default
-    ``0``
+    * ``"0.5-1.5"``
+        ``[Danbooru]``, ``[E621]``, ``[foolfuuka]:search``, ``itaku``,
+        ``newgrounds``, ``[philomena]``, ``pixiv:novel``, ``plurk``,
+        ``poipiku`` , ``pornpics``, ``soundgasm``, ``urlgalleries``,
+        ``vk``, ``zerochan``
+    * ``"1.0-2.0"``
+        ``flickr``, ``weibo``, ``[wikimedia]``
+    * ``"2.0-4.0"``
+        ``behance``, ``imagefap``, ``[Nijie]``
+    * ``"3.0-6.0"``
+        ``exhentai``, ``idolcomplex``, ``[reactor]``, ``readcomiconline``
+    * ``"6.0-6.1"``
+        ``twibooru``
+    * ``"6.0-12.0"``
+        ``instagram``
+    * ``0``
+        otherwise
Description
    Minimal time interval in seconds between each HTTP request
    during data extraction.
@@ -382,6 +418,7 @@ Description
    Specifying username and password is required for

    * ``nijie``
+   * ``horne``

    and optional for
@@ -389,8 +426,12 @@ Description
    * ``aryion``
    * ``atfbooru`` (*)
    * ``bluesky``
+   * ``booruvar`` (*)
+   * ``coomerparty``
    * ``danbooru`` (*)
+   * ``deviantart``
    * ``e621`` (*)
+   * ``e6ai`` (*)
    * ``e926`` (*)
    * ``exhentai``
    * ``idolcomplex``
@@ -401,7 +442,6 @@ Description
    * ``mangoxo``
    * ``pillowfort``
    * ``sankaku``
-   * ``seisoparty``
    * ``subscribestar``
    * ``tapas``
    * ``tsumino``
@@ -417,7 +457,7 @@ Description
    the API key found in your user profile, not the actual account password.

    Note: Leave the ``password`` value empty or undefined
-   to get prompted for a passeword when performing a login
+   to be prompted for a passeword when performing a login
    (see `getpass() <https://docs.python.org/3/library/getpass.html#getpass.getpass>`__).
@@ -557,8 +597,8 @@ extractor.*.browser
Type
    ``string``
Default
-    * ``"firefox"`` for ``patreon``, ``mangapark``, and ``mangasee``
-    * ``null`` everywhere else
+    * ``"firefox"``: ``artstation``, ``mangasee``, ``patreon``, ``pixiv:series``, ``twitter``
+    * ``null``: otherwise
Example
    * ``"chrome:macos"``
Description
@@ -633,8 +673,8 @@ extractor.*.tls12
Type
    ``bool``
Default
-    * ``true``
-    * ``false`` for ``patreon``, ``pixiv:series``
+    * ``false``: ``patreon``, ``pixiv:series``
+    * ``true``: otherwise
Description
    Allow selecting TLS 1.2 cipher suites.
@@ -813,6 +853,22 @@ Description
    An alternative `format string`_ to build archive IDs with.
extractor.*.archive-mode
------------------------
Type
``string``
Default
``"file"``
Description
Controls when to write `archive IDs <extractor.*.archive-format_>`__
to the archive database.
* ``"file"``: Write IDs immediately
after completing or skipping a file download.
* ``"memory"``: Keep IDs in memory
and only write them after successful job completion.
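
A minimal sketch of how this new option might be combined with an existing ``archive`` path (both values illustrative, not from this commit):

.. code:: json

    {
        "extractor": {
            "archive": "~/gallery-dl/archive.sqlite3",
            "archive-mode": "memory"
        }
    }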
extractor.*.archive-prefix
--------------------------
Type
@@ -836,6 +892,65 @@ Description
    for available ``PRAGMA`` statements and further details.
extractor.*.actions
-------------------
Type
* ``object`` (`pattern` -> `action`)
* ``list`` of ``lists`` with 2 ``strings`` as elements
Example
.. code:: json
{
"error" : "status |= 1",
"warning:(?i)unable to .+": "exit 127",
"info:Logging in as .+" : "level = debug"
}
.. code:: json
[
["error" , "status |= 1" ],
["warning:(?i)unable to .+", "exit 127" ],
["info:Logging in as .+" , "level = debug"]
]
Description
Perform an ``action`` when logging a message matched by ``pattern``.
``pattern`` is parsed as severity level (``debug``, ``info``, ``warning``, ``error``, or integer value)
followed by an optional `Python Regular Expression <https://docs.python.org/3/library/re.html#regular-expression-syntax>`__
separated by a colon ``:``.
Using ``*`` as `level` or leaving it empty
matches logging messages of all levels
(e.g. ``*:<re>`` or ``:<re>``).
``action`` is parsed as action type
followed by (optional) arguments.
Supported Action Types:
``status``:
| Modify job exit status.
| Expected syntax is ``<operator> <value>`` (e.g. ``= 100``).
Supported operators are
``=`` (assignment),
``&`` (bitwise AND),
``|`` (bitwise OR),
``^`` (bitwise XOR).
``level``:
| Modify severity level of the current logging message.
| Can be one of ``debug``, ``info``, ``warning``, ``error`` or an integer value.
``print``
Write argument to stdout.
``restart``:
Restart the current extractor run.
``wait``:
Stop execution until Enter is pressed.
``exit``:
Exit the program with the given argument as exit status.
extractor.*.postprocessors
--------------------------
Type
@@ -1872,6 +1987,20 @@ Description
    from `linking your Flickr account to gallery-dl <OAuth_>`__.
extractor.flickr.contexts
-------------------------
Type
``bool``
Default
``false``
Description
For each photo, return the albums and pools it belongs to
as ``set`` and ``pool`` metadata.
Note: This requires 1 additional API call per photo.
See `flickr.photos.getAllContexts <https://www.flickr.com/services/api/flickr.photos.getAllContexts.html>`__ for details.
extractor.flickr.exif
---------------------
Type
@@ -1879,9 +2008,11 @@ Type
Default
    ``false``
Description
-    Fetch `exif` and `camera` metadata for each photo.
+    For each photo, return its EXIF/TIFF/GPS tags
+    as ``exif`` and ``camera`` metadata.

    Note: This requires 1 additional API call per photo.
+    See `flickr.photos.getExif <https://www.flickr.com/services/api/flickr.photos.getExif.html>`__ for details.


extractor.flickr.metadata
@@ -1901,7 +2032,7 @@ Description
    It is possible to specify a custom list of metadata includes.
    See `the extras parameter <https://www.flickr.com/services/api/flickr.people.getPhotos.html>`__
-    in `Flickr API docs <https://www.flickr.com/services/api/>`__
+    in `Flickr's API docs <https://www.flickr.com/services/api/>`__
    for possible field names.
@@ -2001,6 +2132,20 @@ Description
    page.
extractor.gelbooru.favorite.order-posts
---------------------------------------
Type
``string``
Default
``"desc"``
Description
Controls the order in which favorited posts are returned.
* ``"asc"``: Ascending favorite date order (oldest first)
* ``"desc"``: Descending favorite date order (newest first)
* ``"reverse"``: Same as ``"asc"``
extractor.generic.enabled
-------------------------
Type
@@ -2287,6 +2432,16 @@ Description
    Extract a user's direct messages as ``dms`` metadata.
extractor.kemonoparty.announcements
-----------------------------------
Type
``bool``
Default
``false``
Description
Extract a user's announcements as ``announcements`` metadata.
extractor.kemonoparty.favorites
-------------------------------
Type
@@ -2346,6 +2501,22 @@ Description
    Note: This requires 1 additional HTTP request per post.
extractor.kemonoparty.order-revisions
-------------------------------------
Type
``string``
Default
``"desc"``
Description
Controls the order in which
`revisions <extractor.kemonoparty.revisions_>`__
are returned.
* ``"asc"``: Ascending order (oldest first)
* ``"desc"``: Descending order (newest first)
* ``"reverse"``: Same as ``"asc"``
extractor.khinsider.format
--------------------------
Type
@@ -2470,6 +2641,16 @@ Description
    user IDs.
extractor.[mastodon].cards
--------------------------
Type
``bool``
Default
``false``
Description
Fetch media from cards.
extractor.[mastodon].reblogs
----------------------------
Type
@@ -2829,14 +3010,24 @@ Description
    `gppt <https://github.com/eggplants/get-pixivpy-token>`__.


-extractor.pixiv.embeds
-----------------------
+extractor.pixiv.novel.covers
+----------------------------
Type
    ``bool``
Default
    ``false``
Description
-    Download images embedded in novels.
+    Download cover images.
extractor.pixiv.novel.embeds
----------------------------
Type
``bool``
Default
``false``
Description
Download embedded images.
extractor.pixiv.novel.full-series
@@ -3286,7 +3477,7 @@ Examples
    * ``["jpeg", "webp"]``
Description
    Only include assets that are in the specified file types. ``all`` can be
-    used to specifiy all file types. Valid values are:
+    used to specify all file types. Valid values are:

    * Grids: ``png``, ``jpeg``, ``jpg``, ``webp``
    * Heroes: ``png``, ``jpeg``, ``jpg``, ``webp``
@@ -3326,7 +3517,7 @@ Examples
    * ``["fr", "it"]``
Description
    Only include assets that are in the specified languages. ``all`` can be
-    used to specifiy all languages. Valid values are `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639-1>`__
+    used to specify all languages. Valid values are `ISO 639-1 <https://en.wikipedia.org/wiki/ISO_639-1>`__
    language codes.
@@ -3771,6 +3962,32 @@ Description
    * ``"wait"``: Wait until rate limit reset
extractor.twitter.relogin
-------------------------
Type
``bool``
Default
``true``
Description
| When receiving a "Could not authenticate you" error while logged in with
`username & passeword <extractor.*.username & .password_>`__,
| refresh the current login session and
try to continue from where it left off.
extractor.twitter.locked
------------------------
Type
``string``
Default
``"abort"``
Description
Selects how to handle "account is temporarily locked" errors.
* ``"abort"``: Raise an error and stop extraction
* ``"wait"``: Wait until the account is unlocked and retry
extractor.twitter.replies
-------------------------
Type
@@ -3909,6 +4126,31 @@ Description
    ``"raw"``, ``"full"``, ``"regular"``, ``"small"``, and ``"thumb"``.
extractor.vipergirls.domain
---------------------------
Type
``string``
Default
``"vipergirls.to"``
Description
Specifies the domain used by ``vipergirls`` extractors.
For example ``"viper.click"`` if the main domain is blocked or to bypass Cloudflare,
extractor.vipergirls.like
-------------------------
Type
``bool``
Default
``false``
Description
Automatically `like` posts after downloading their images.
Note: Requires `login <extractor.*.username & .password_>`__
or `cookies <extractor.*.cookies_>`__
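
An illustrative snippet combining the two new ``vipergirls`` options (the domain value comes from the description above; the credentials are placeholders):

.. code:: json

    {
        "extractor": {
            "vipergirls": {
                "domain": "viper.click",
                "like": true,
                "username": "your-username",
                "password": "your-password"
            }
        }
    }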
extractor.vsco.videos
---------------------
Type
@@ -4039,7 +4281,7 @@ extractor.weibo.retweets
Type
    ``bool``
Default
-    ``true``
+    ``false``
Description
    Fetch media from retweeted posts.
@@ -4714,10 +4956,33 @@ output.colors
Type
    ``object`` (`key` -> `ANSI color`)
Default
-    ``{"success": "1;32", "skip": "2"}``
+    .. code:: json
{
"success": "1;32",
"skip" : "2",
"debug" : "0;37",
"info" : "1;37",
"warning": "1;33",
"error" : "1;31"
}
Description
-    Controls the `ANSI colors <https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797#colors--graphics-mode>`__
-    used with |mode: color|__ for successfully downloaded or skipped files.
+    Controls the
+    `ANSI colors <https://gist.github.com/fnky/458719343aabd01cfb17a3a4f7296797#colors--graphics-mode>`__
+    used for various outputs.
Output for |mode: color|__
* ``success``: successfully downloaded files
* ``skip``: skipped files
Logging Messages:
* ``debug``: debug logging messages
* ``info``: info logging messages
* ``warning``: warning logging messages
* ``error``: error logging messages
.. __: `output.mode`_


@@ -4727,7 +4992,7 @@ output.ansi
Type
    ``bool``
Default
-    ``false``
+    ``true``
Description
    | On Windows, enable ANSI escape sequences and colored output
    | by setting the ``ENABLE_VIRTUAL_TERMINAL_PROCESSING`` flag for stdout and stderr.
@@ -5784,7 +6049,7 @@ How To
    * choose a name
    * select "installed app"
    * set ``http://localhost:6414/`` as "redirect uri"
-    * solve the "I'm not a rebot" reCATCHA if needed
+    * solve the "I'm not a robot" reCAPTCHA if needed
    * click "create app"
    * copy the client id (third line, under your application's name and
@@ -5932,7 +6197,7 @@ Description
    * format
      * General format string for logging messages
-       or a dictionary with format strings for each loglevel.
+       or an ``object`` with format strings for each loglevel.
      In addition to the default
      `LogRecord attributes <https://docs.python.org/3/library/logging.html#logrecord-attributes>`__,

docs/links.js (new file)

@@ -0,0 +1,44 @@
"use strict";
function add_header_links()
{
let style = document.createElement("style");
style.id = "headerlinks"
document.head.appendChild(style);
style.sheet.insertRule(
"a.headerlink {" +
" visibility: hidden;" +
" text-decoration: none;" +
" font-size: 0.8em;" +
" padding: 0 4px 0 4px;" +
"}");
style.sheet.insertRule(
":hover > a.headerlink {" +
" visibility: visible;" +
"}");
let headers = document.querySelectorAll("h2, h3, h4, h5, h6");
for (let i = 0, len = headers.length; i < len; ++i)
{
let header = headers[i];
let id = header.id || header.parentNode.id;
if (!id)
continue;
let link = document.createElement("a");
link.href = "#" + id;
link.className = "headerlink";
link.textContent = "¶";
header.appendChild(link);
}
}
if (document.readyState !== "loading") {
add_header_links();
} else {
document.addEventListener("DOMContentLoaded", add_header_links);
}

@@ -29,6 +29,7 @@
## Output Options:
    -q, --quiet                Activate quiet mode
+    -w, --warning              Print only warnings and errors
    -v, --verbose              Print various debugging information
    -g, --get-urls             Print URLs instead of downloading
    -G, --resolve-urls         Print URLs instead of downloading; resolve
@@ -48,12 +49,12 @@
                               extractors but cannot be handled, to FILE
    --write-pages              Write downloaded intermediary pages to files in
                               the current directory to debug problems
+    --no-colors                Do not emit ANSI color codes in output

## Downloader Options:
    -r, --limit-rate RATE      Maximum download rate (e.g. 500k or 2.5M)
    -R, --retries N            Maximum number of retries for failed HTTP
-                               requests or -1 for infinite retries (default:
-                               4)
+                               requests or -1 for infinite retries (default: 4)
    --http-timeout SECONDS     Timeout for HTTP connections (default: 30.0)
    --sleep SECONDS            Number of seconds to wait before each download.
                               This can be either a constant value or a range

@@ -790,7 +790,7 @@ Consider all listed sites to potentially be NSFW.
<tr>
    <td>Skeb</td>
    <td>https://skeb.jp/</td>
-    <td>Followed Users, Posts, Search Results, User Profiles</td>
+    <td>Followed Creators, Followed Users, Posts, Search Results, User Profiles</td>
    <td></td>
</tr>
<tr>
@@ -838,7 +838,7 @@ Consider all listed sites to potentially be NSFW.
<tr>
    <td>Tapas</td>
    <td>https://tapas.io/</td>
-    <td>Episodes, Series</td>
+    <td>Creators, Episodes, Series</td>
    <td>Supported</td>
</tr>
<tr>
@@ -898,7 +898,7 @@ Consider all listed sites to potentially be NSFW.
<tr>
    <td>Twitter</td>
    <td>https://twitter.com/</td>
-    <td>Avatars, Backgrounds, Bookmarks, Communities, Events, Followed Users, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets, User Profiles</td>
+    <td>Avatars, Backgrounds, Bookmarks, Communities, Events, Followed Users, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Quotes, Search Results, Timelines, Tweets, User Profiles</td>
    <td>Supported</td>
</tr>
<tr>
@@ -940,14 +940,14 @@ Consider all listed sites to potentially be NSFW.
<tr>
    <td>VSCO</td>
    <td>https://vsco.co/</td>
-    <td>Collections, individual Images, Spaces, User Profiles</td>
+    <td>Avatars, Collections, individual Images, Spaces, User Profiles</td>
    <td></td>
</tr>
<tr>
    <td>Wallhaven</td>
    <td>https://wallhaven.cc/</td>
    <td>Collections, individual Images, Search Results, User Profiles</td>
-    <td><a href="configuration.rst#extractorwallhavenapi-key">API Key</a></td>
+    <td><a href="https://gdl-org.github.io/docs/configuration.html#extractor-wallhaven-api-key">API Key</a></td>
</tr>
<tr>
    <td>Wallpaper Cave</td>
@@ -965,7 +965,7 @@ Consider all listed sites to potentially be NSFW.
    <td>Weasyl</td>
    <td>https://www.weasyl.com/</td>
    <td>Favorites, Folders, Journals, Submissions</td>
-    <td><a href="configuration.rst#extractorweasylapi-key">API Key</a></td>
+    <td><a href="https://gdl-org.github.io/docs/configuration.html#extractor-weasyl-api-key">API Key</a></td>
</tr>
<tr>
    <td>webmshare</td>
@@ -1103,7 +1103,7 @@ Consider all listed sites to potentially be NSFW.
    <td>Booruvar</td>
    <td>https://booru.borvar.art/</td>
    <td>Pools, Popular Images, Posts, Tag Searches</td>
-    <td></td>
+    <td>Supported</td>
</tr>
<tr>
@@ -1125,7 +1125,7 @@ Consider all listed sites to potentially be NSFW.
    <td>e6AI</td>
    <td>https://e6ai.net/</td>
    <td>Favorites, Pools, Popular Images, Posts, Tag Searches</td>
-    <td></td>
+    <td>Supported</td>
</tr>
<tr>
@@ -1319,7 +1319,7 @@ Consider all listed sites to potentially be NSFW.
    <td>Derpibooru</td>
    <td>https://derpibooru.org/</td>
    <td>Galleries, Posts, Search Results</td>
-    <td><a href="configuration.rst#extractorderpibooruapi-key">API Key</a></td>
+    <td><a href="https://gdl-org.github.io/docs/configuration.html#extractor-derpibooru-api-key">API Key</a></td>
</tr>
<tr>
    <td>Ponybooru</td>
@@ -1331,7 +1331,7 @@ Consider all listed sites to potentially be NSFW.
    <td>Furbooru</td>
    <td>https://furbooru.org/</td>
    <td>Galleries, Posts, Search Results</td>
-    <td></td>
+    <td>API Key</td>
</tr>
<tr>
@@ -1499,6 +1499,12 @@ Consider all listed sites to potentially be NSFW.
    <td>Articles</td>
    <td></td>
</tr>
+<tr>
+    <td>wiki.gg</td>
+    <td>https://www.wiki.gg/</td>
+    <td>Articles</td>
+    <td></td>
+</tr>
<tr>
    <td>Super Mario Wiki</td>
    <td>https://www.mariowiki.com/</td>
@@ -1616,19 +1622,19 @@ Consider all listed sites to potentially be NSFW.
<tr>
    <td>mastodon.social</td>
    <td>https://mastodon.social/</td>
-    <td>Bookmarks, Followed Users, Images from Statuses, User Profiles</td>
+    <td>Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles</td>
    <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr>
<tr>
    <td>Pawoo</td>
    <td>https://pawoo.net/</td>
-    <td>Bookmarks, Followed Users, Images from Statuses, User Profiles</td>
+    <td>Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles</td>
    <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr>
<tr>
    <td>baraag</td>
    <td>https://baraag.net/</td>
-    <td>Bookmarks, Followed Users, Images from Statuses, User Profiles</td>
+    <td>Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles</td>
    <td><a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a></td>
</tr>

@@ -38,6 +38,11 @@ def main():
            except ImportError:
                import toml
            config.load(args.configs_toml, strict=True, loads=toml.loads)
+        if not args.colors:
+            output.ANSI = False
+            config.set((), "colors", False)
+            if util.WINDOWS:
+                config.set(("output",), "ansi", False)
        if args.filename:
            filename = args.filename
            if filename == "/O":
@@ -86,7 +91,7 @@ def main():
                signal.signal(signal_num, signal.SIG_IGN)

        # enable ANSI escape sequences on Windows
-        if util.WINDOWS and config.get(("output",), "ansi"):
+        if util.WINDOWS and config.get(("output",), "ansi", output.COLORS):
            from ctypes import windll, wintypes, byref
            kernel32 = windll.kernel32
            mode = wintypes.DWORD()
@@ -113,7 +118,7 @@ def main():
        # loglevels
        output.configure_logging(args.loglevel)
-        if args.loglevel >= logging.ERROR:
+        if args.loglevel >= logging.WARNING:
            config.set(("output",), "mode", "null")
            config.set(("downloader",), "progress", None)
        elif args.loglevel <= logging.DEBUG:

gallery_dl/archive.py (new file)

@@ -0,0 +1,98 @@
# -*- coding: utf-8 -*-

# Copyright 2024 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Download Archives"""

import os
import sqlite3
from . import formatter


class DownloadArchive():

    def __init__(self, path, format_string, pragma=None,
                 cache_key="_archive_key"):
        try:
            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
        except sqlite3.OperationalError:
            os.makedirs(os.path.dirname(path))
            con = sqlite3.connect(path, timeout=60, check_same_thread=False)
        con.isolation_level = None

        self.keygen = formatter.parse(format_string).format_map
        self.connection = con
        self.close = con.close
        self.cursor = cursor = con.cursor()
        self._cache_key = cache_key

        if pragma:
            for stmt in pragma:
                cursor.execute("PRAGMA " + stmt)

        try:
            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                           "(entry TEXT PRIMARY KEY) WITHOUT ROWID")
        except sqlite3.OperationalError:
            # fallback for missing WITHOUT ROWID support (#553)
            cursor.execute("CREATE TABLE IF NOT EXISTS archive "
                           "(entry TEXT PRIMARY KEY)")

    def add(self, kwdict):
        """Add item described by 'kwdict' to archive"""
        key = kwdict.get(self._cache_key) or self.keygen(kwdict)
        self.cursor.execute(
            "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))

    def check(self, kwdict):
        """Return True if the item described by 'kwdict' exists in archive"""
        key = kwdict[self._cache_key] = self.keygen(kwdict)
        self.cursor.execute(
            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
        return self.cursor.fetchone()

    def finalize(self):
        pass


class DownloadArchiveMemory(DownloadArchive):

    def __init__(self, path, format_string, pragma=None,
                 cache_key="_archive_key"):
        DownloadArchive.__init__(self, path, format_string, pragma, cache_key)
        self.keys = set()

    def add(self, kwdict):
        self.keys.add(
            kwdict.get(self._cache_key) or
            self.keygen(kwdict))

    def check(self, kwdict):
        key = kwdict[self._cache_key] = self.keygen(kwdict)
        if key in self.keys:
            return True
        self.cursor.execute(
            "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
        return self.cursor.fetchone()

    def finalize(self):
        if not self.keys:
            return

        cursor = self.cursor
        with self.connection:
            try:
                cursor.execute("BEGIN")
            except sqlite3.OperationalError:
                pass

            stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)"
            if len(self.keys) < 100:
                for key in self.keys:
                    cursor.execute(stmt, (key,))
            else:
                cursor.executemany(stmt, ((key,) for key in self.keys))
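
A short usage sketch of the new module (the path and archive format string below are assumptions for illustration, not taken from this commit):

from gallery_dl import archive

# keep keys in memory and flush them in finalize(), as "archive-mode": "memory" does
db = archive.DownloadArchiveMemory(
    "/tmp/test-archive.sqlite3", "{category}_{id}")

kwdict = {"category": "example", "id": 12345}
if not db.check(kwdict):   # not yet recorded
    db.add(kwdict)         # remember this file
db.finalize()              # write collected keys to the database
db.close()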

@@ -10,7 +10,6 @@
# https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/cookies.py

import binascii
-import contextlib
import ctypes
import logging
import os
@@ -147,7 +146,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None,
        set_cookie(Cookie(
            0, name, value, None, False,
            domain, bool(domain), domain.startswith("."),
-            path, bool(path), secure, expires, False, None, None, {},
+            path, bool(path), secure, expires or None, False,
+            None, None, {},
        ))

    if failed_cookies > 0:
@@ -682,7 +682,8 @@ def _get_gnome_keyring_password(browser_keyring_name):
    # lists all keys and presumably searches for its key in the list.
    # It appears that we must do the same.
    # https://github.com/jaraco/keyring/issues/556
-    with contextlib.closing(secretstorage.dbus_init()) as con:
+    con = secretstorage.dbus_init()
+    try:
        col = secretstorage.get_default_collection(con)
        label = browser_keyring_name + " Safe Storage"
        for item in col.get_all_items():
@@ -691,6 +692,8 @@ def _get_gnome_keyring_password(browser_keyring_name):
    else:
        _log_error("Failed to read from GNOME keyring")
        return b""
+    finally:
+        con.close()


def _get_linux_keyring_password(browser_keyring_name, keyring):
@@ -857,7 +860,7 @@ class DatabaseConnection():

def Popen_communicate(*args):
-    proc = subprocess.Popen(
+    proc = util.Popen(
        args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL)
    try:
        stdout, stderr = proc.communicate()
@@ -999,6 +1002,12 @@ def _decrypt_windows_dpapi(ciphertext):

def _find_most_recently_used_file(root, filename):
+    # if the provided root points to an exact profile path
+    # check if it contains the wanted filename
+    first_choice = os.path.join(root, filename)
+    if os.path.exists(first_choice):
+        return first_choice
+
    # if there are multiple browser profiles, take the most recently used one
    paths = []
    for curr_root, dirs, files in os.walk(root):

@@ -98,6 +98,8 @@ class HttpDownloader(DownloaderBase):
        metadata = self.metadata
        kwdict = pathfmt.kwdict

+        expected_status = kwdict.get(
+            "_http_expected_status", ())
        adjust_extension = kwdict.get(
            "_http_adjust_extension", self.adjust_extension)
@@ -151,7 +153,7 @@ class HttpDownloader(DownloaderBase):

            # check response
            code = response.status_code
-            if code == 200:  # OK
+            if code == 200 or code in expected_status:  # OK
                offset = 0
                size = response.headers.get("Content-Length")
            elif code == 206:  # Partial Content
@@ -399,6 +401,9 @@ MIME_TYPES = {
    "video/webm": "webm",
    "video/ogg" : "ogg",
    "video/mp4" : "mp4",
+    "video/m4v" : "m4v",
+    "video/x-m4v": "m4v",
+    "video/quicktime": "mov",

    "audio/wav" : "wav",
    "audio/x-wav": "wav",
@@ -440,7 +445,9 @@ SIGNATURE_CHECKS = {
    "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00",
    "psd" : lambda s: s[0:4] == b"8BPS",
    "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in (
-        b"mp4", b"avc", b"iso", b"M4V")),
+        b"mp4", b"avc", b"iso")),
+    "m4v" : lambda s: s[4:11] == b"ftypM4V",
+    "mov" : lambda s: s[4:12] == b"ftypqt ",
    "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3",
    "ogg" : lambda s: s[0:4] == b"OggS",
    "wav" : lambda s: (s[0:4] == b"RIFF" and

@@ -26,6 +26,9 @@ class _8chanExtractor(Extractor):
        self.root = "https://8chan." + match.group(1)
        Extractor.__init__(self, match)

+    def _init(self):
+        self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2])
+
    @memcache()
    def cookies_prepare(self):
        # fetch captcha cookies

@@ -40,6 +40,7 @@ class BlueskyExtractor(Extractor):
        self.api = BlueskyAPI(self)
        self._user = self._user_did = None
+        self.instance = self.root.partition("://")[2]

    def items(self):
        for post in self.posts():
@@ -81,6 +82,7 @@ class BlueskyExtractor(Extractor):
            if self._metadata_user:
                post["user"] = self._user or post["author"]

+            post["instance"] = self.instance
            post["post_id"] = pid
            post["count"] = len(images)
            post["date"] = text.parse_datetime(
@@ -315,7 +317,7 @@ class BlueskyAPI():
    def get_author_feed(self, actor, filter="posts_and_author_threads"):
        endpoint = "app.bsky.feed.getAuthorFeed"
        params = {
-            "actor" : self._did_from_actor(actor),
+            "actor" : self._did_from_actor(actor, True),
            "filter": filter,
            "limit" : "100",
        }
@@ -325,7 +327,7 @@ class BlueskyAPI():
        endpoint = "app.bsky.feed.getFeed"
        params = {
            "feed" : "at://{}/app.bsky.feed.generator/{}".format(
-                self._did_from_actor(actor, False), feed),
+                self._did_from_actor(actor), feed),
            "limit": "100",
        }
        return self._pagination(endpoint, params)
@@ -342,7 +344,7 @@ class BlueskyAPI():
        endpoint = "app.bsky.feed.getListFeed"
        params = {
            "list" : "at://{}/app.bsky.graph.list/{}".format(
-                self._did_from_actor(actor, False), list),
+                self._did_from_actor(actor), list),
            "limit": "100",
        }
        return self._pagination(endpoint, params)
@@ -389,7 +391,7 @@ class BlueskyAPI():
        }
        return self._pagination(endpoint, params, "posts")

-    def _did_from_actor(self, actor, user_did=True):
+    def _did_from_actor(self, actor, user_did=False):
        if actor.startswith("did:"):
            did = actor
        else:

@@ -54,7 +54,6 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor):
            "album_id"   : self.album_id,
            "album_name" : text.unescape(info[0]),
            "album_size" : size[1:-1],
-            "description": text.unescape(info[2]) if len(info) > 2 else "",
            "count"      : len(urls),
        }

View File

@ -14,6 +14,7 @@ import ssl
import time import time
import netrc import netrc
import queue import queue
import getpass
import logging import logging
import datetime import datetime
import requests import requests
@ -21,6 +22,7 @@ import threading
from requests.adapters import HTTPAdapter from requests.adapters import HTTPAdapter
from .message import Message from .message import Message
from .. import config, text, util, cache, exception from .. import config, text, util, cache, exception
urllib3 = requests.packages.urllib3
class Extractor(): class Extractor():
@ -45,6 +47,8 @@ class Extractor():
def __init__(self, match): def __init__(self, match):
self.log = logging.getLogger(self.category) self.log = logging.getLogger(self.category)
self.url = match.string self.url = match.string
self.match = match
self.groups = match.groups()
self._cfgpath = ("extractor", self.category, self.subcategory) self._cfgpath = ("extractor", self.category, self.subcategory)
self._parentdir = "" self._parentdir = ""
@ -168,22 +172,25 @@ class Extractor():
requests.exceptions.ChunkedEncodingError, requests.exceptions.ChunkedEncodingError,
requests.exceptions.ContentDecodingError) as exc: requests.exceptions.ContentDecodingError) as exc:
msg = exc msg = exc
code = 0
except (requests.exceptions.RequestException) as exc: except (requests.exceptions.RequestException) as exc:
raise exception.HttpError(exc) raise exception.HttpError(exc)
else: else:
code = response.status_code code = response.status_code
if self._write_pages: if self._write_pages:
self._dump_response(response) self._dump_response(response)
if 200 <= code < 400 or fatal is None and \ if (
(400 <= code < 500) or not fatal and \ code < 400 or
(400 <= code < 429 or 431 <= code < 500): code < 500 and (not fatal and code != 429 or fatal is None)
):
if encoding: if encoding:
response.encoding = encoding response.encoding = encoding
return response return response
if notfound and code == 404: if notfound and code == 404:
raise exception.NotFoundError(notfound) raise exception.NotFoundError(notfound)
msg = "'{} {}' for '{}'".format(code, response.reason, url) msg = "'{} {}' for '{}'".format(
code, response.reason, response.url)
server = response.headers.get("Server") server = response.headers.get("Server")
if server and server.startswith("cloudflare") and \ if server and server.startswith("cloudflare") and \
code in (403, 503): code in (403, 503):
@ -194,7 +201,10 @@ class Extractor():
if b'name="captcha-bypass"' in content: if b'name="captcha-bypass"' in content:
self.log.warning("Cloudflare CAPTCHA") self.log.warning("Cloudflare CAPTCHA")
break break
if code not in retry_codes and code < 500:
if code == 429 and self._interval_429:
pass
elif code not in retry_codes and code < 500:
break break
finally: finally:
@ -203,15 +213,25 @@ class Extractor():
self.log.debug("%s (%s/%s)", msg, tries, retries+1) self.log.debug("%s (%s/%s)", msg, tries, retries+1)
if tries > retries: if tries > retries:
break break
self.sleep(
max(tries, self._interval()) if self._interval else tries, seconds = tries
"retry") if self._interval:
s = self._interval()
if seconds < s:
seconds = s
if code == 429 and self._interval_429:
s = self._interval_429()
if seconds < s:
seconds = s
self.wait(seconds=seconds, reason="429 Too Many Requests")
else:
self.sleep(seconds, "retry")
tries += 1 tries += 1
raise exception.HttpError(msg, response) raise exception.HttpError(msg, response)
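
The delay chosen before the next retry in the loop above amounts to the maximum of three values. A minimal sketch, assuming the configured intervals are plain numbers; retry_delay is an illustrative name, and "sleep-429" defaults to 60 seconds as set further down in this file.

def retry_delay(tries, code, sleep_request=None, sleep_429=None):
    seconds = tries                      # base delay grows with each attempt
    if sleep_request:                    # honor the "sleep-request" interval
        seconds = max(seconds, sleep_request)
    if code == 429 and sleep_429:        # rate limited: honor "sleep-429" too
        seconds = max(seconds, sleep_429)
    return seconds

assert retry_delay(1, 500) == 1
assert retry_delay(1, 500, sleep_request=5) == 5
assert retry_delay(2, 429, sleep_request=5, sleep_429=60) == 60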
def wait(self, seconds=None, until=None, adjust=1.0, def wait(self, seconds=None, until=None, adjust=1.0,
reason="rate limit reset"): reason="rate limit"):
now = time.time() now = time.time()
if seconds: if seconds:
@ -234,7 +254,7 @@ class Extractor():
if reason: if reason:
t = datetime.datetime.fromtimestamp(until).time() t = datetime.datetime.fromtimestamp(until).time()
isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second) isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second)
self.log.info("Waiting until %s for %s.", isotime, reason) self.log.info("Waiting until %s (%s)", isotime, reason)
time.sleep(seconds) time.sleep(seconds)
def sleep(self, seconds, reason): def sleep(self, seconds, reason):
@ -242,6 +262,15 @@ class Extractor():
seconds, reason) seconds, reason)
time.sleep(seconds) time.sleep(seconds)
def input(self, prompt, echo=True):
if echo:
try:
return input(prompt)
except (EOFError, OSError):
return None
else:
return getpass.getpass(prompt)
def _get_auth_info(self): def _get_auth_info(self):
"""Return authentication information as (username, password) tuple""" """Return authentication information as (username, password) tuple"""
username = self.config("username") username = self.config("username")
@ -274,6 +303,9 @@ class Extractor():
self.config("sleep-request", self.request_interval), self.config("sleep-request", self.request_interval),
self.request_interval_min, self.request_interval_min,
) )
self._interval_429 = util.build_duration_func(
self.config("sleep-429", 60),
)
if self._retries < 0: if self._retries < 0:
self._retries = float("inf") self._retries = float("inf")
@ -433,9 +465,11 @@ class Extractor():
if not path: if not path:
return return
path_tmp = path + ".tmp"
try: try:
with open(path, "w") as fp: with open(path_tmp, "w") as fp:
util.cookiestxt_store(fp, self.cookies) util.cookiestxt_store(fp, self.cookies)
os.replace(path_tmp, path)
except OSError as exc: except OSError as exc:
self.log.warning("cookies: %s", exc) self.log.warning("cookies: %s", exc)
@ -593,7 +627,7 @@ class GalleryExtractor(Extractor):
def __init__(self, match, url=None): def __init__(self, match, url=None):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.gallery_url = self.root + match.group(1) if url is None else url self.gallery_url = self.root + self.groups[0] if url is None else url
def items(self): def items(self):
self.login() self.login()
@ -668,7 +702,7 @@ class MangaExtractor(Extractor):
def __init__(self, match, url=None): def __init__(self, match, url=None):
Extractor.__init__(self, match) Extractor.__init__(self, match)
self.manga_url = url or self.root + match.group(1) self.manga_url = self.root + self.groups[0] if url is None else url
if self.config("chapter-reverse", False): if self.config("chapter-reverse", False):
self.reverse = not self.reverse self.reverse = not self.reverse
@ -730,17 +764,18 @@ class BaseExtractor(Extractor):
instances = () instances = ()
def __init__(self, match): def __init__(self, match):
if not self.category:
self._init_category(match)
Extractor.__init__(self, match) Extractor.__init__(self, match)
if not self.category:
self._init_category()
self._cfgpath = ("extractor", self.category, self.subcategory)
def _init_category(self, match): def _init_category(self):
for index, group in enumerate(match.groups()): for index, group in enumerate(self.groups):
if group is not None: if group is not None:
if index: if index:
self.category, self.root, info = self.instances[index-1] self.category, self.root, info = self.instances[index-1]
if not self.root: if not self.root:
self.root = text.root_from_url(match.group(0)) self.root = text.root_from_url(self.match.group(0))
self.config_instance = info.get self.config_instance = info.get
else: else:
self.root = group self.root = group
@ -800,12 +835,9 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address):
pass pass
if ssl_options or ssl_ciphers: if ssl_options or ssl_ciphers:
ssl_context = ssl.create_default_context() ssl_context = urllib3.connection.create_urllib3_context(
if ssl_options: options=ssl_options or None, ciphers=ssl_ciphers)
ssl_context.options |= ssl_options ssl_context.check_hostname = False
if ssl_ciphers:
ssl_context.set_ecdh_curve("prime256v1")
ssl_context.set_ciphers(ssl_ciphers)
else: else:
ssl_context = None ssl_context = None
@ -925,8 +957,6 @@ SSL_CIPHERS = {
} }
urllib3 = requests.packages.urllib3
# detect brotli support # detect brotli support
try: try:
BROTLI = urllib3.response.brotli is not None BROTLI = urllib3.response.brotli is not None

View File

@ -18,12 +18,12 @@ import binascii
import time import time
import re import re
BASE_PATTERN = ( BASE_PATTERN = (
r"(?:https?://)?(?:" r"(?:https?://)?(?:"
r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|" r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|"
r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)" r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)"
) )
DEFAULT_AVATAR = "https://a.deviantart.net/avatars/default.gif"
class DeviantartExtractor(Extractor): class DeviantartExtractor(Extractor):
@ -84,6 +84,16 @@ class DeviantartExtractor(Extractor):
else: else:
self.commit_journal = None self.commit_journal = None
def request(self, url, **kwargs):
if "fatal" not in kwargs:
kwargs["fatal"] = False
while True:
response = Extractor.request(self, url, **kwargs)
if response.status_code != 403 or \
b"Request blocked." not in response.content:
return response
self.wait(seconds=300, reason="CloudFront block")
def skip(self, num): def skip(self, num):
self.offset += num self.offset += num
return num return num
@ -177,6 +187,10 @@ class DeviantartExtractor(Extractor):
for comment in deviation["comments"]: for comment in deviation["comments"]:
user = comment["user"] user = comment["user"]
name = user["username"].lower() name = user["username"].lower()
if user["usericon"] == DEFAULT_AVATAR:
self.log.debug(
"Skipping avatar of '%s' (default)", name)
continue
_user_details.update(name, user) _user_details.update(name, user)
url = "{}/{}/avatar/".format(self.root, name) url = "{}/{}/avatar/".format(self.root, name)
@ -209,7 +223,9 @@ class DeviantartExtractor(Extractor):
"""Adjust the contents of a Deviation-object""" """Adjust the contents of a Deviation-object"""
if "index" not in deviation: if "index" not in deviation:
try: try:
if deviation["url"].startswith("https://sta.sh"): if deviation["url"].startswith((
"https://www.deviantart.com/stash/", "https://sta.sh",
)):
filename = deviation["content"]["src"].split("/")[5] filename = deviation["content"]["src"].split("/")[5]
deviation["index_base36"] = filename.partition("-")[0][1:] deviation["index_base36"] = filename.partition("-")[0][1:]
deviation["index"] = id_from_base36( deviation["index"] = id_from_base36(
@ -456,18 +472,12 @@ class DeviantartExtractor(Extractor):
def _limited_request(self, url, **kwargs): def _limited_request(self, url, **kwargs):
"""Limits HTTP requests to one every 2 seconds""" """Limits HTTP requests to one every 2 seconds"""
kwargs["fatal"] = None
diff = time.time() - DeviantartExtractor._last_request diff = time.time() - DeviantartExtractor._last_request
if diff < 2.0: if diff < 2.0:
self.sleep(2.0 - diff, "request") self.sleep(2.0 - diff, "request")
response = self.request(url, **kwargs)
while True: DeviantartExtractor._last_request = time.time()
response = self.request(url, **kwargs) return response
if response.status_code != 403 or \
b"Request blocked." not in response.content:
DeviantartExtractor._last_request = time.time()
return response
self.wait(seconds=180)
def _fetch_premium(self, deviation): def _fetch_premium(self, deviation):
try: try:
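
The _limited_request helper above keeps enforcing a minimum spacing of two seconds between requests by remembering the time of the last call. A minimal standalone sketch of that throttle; MinInterval is an illustrative name.

import time

class MinInterval:
    """Sleep as needed so consecutive wait() calls are at least
    'seconds' apart."""
    def __init__(self, seconds):
        self.seconds = seconds
        self.last = 0.0

    def wait(self):
        diff = time.time() - self.last
        if diff < self.seconds:
            time.sleep(self.seconds - diff)
        self.last = time.time()

throttle = MinInterval(2.0)
throttle.wait()   # returns immediately on the first call
throttle.wait()   # sleeps for roughly two seconds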
@ -585,7 +595,13 @@ class DeviantartAvatarExtractor(DeviantartExtractor):
return () return ()
icon = user["usericon"] icon = user["usericon"]
index = icon.rpartition("?")[2] if icon == DEFAULT_AVATAR:
self.log.debug("Skipping avatar of '%s' (default)", name)
return ()
_, sep, index = icon.rpartition("?")
if not sep:
index = "0"
formats = self.config("formats") formats = self.config("formats")
if not formats: if not formats:
@ -668,7 +684,8 @@ class DeviantartStashExtractor(DeviantartExtractor):
"""Extractor for sta.sh-ed deviations""" """Extractor for sta.sh-ed deviations"""
subcategory = "stash" subcategory = "stash"
archive_fmt = "{index}.{extension}" archive_fmt = "{index}.{extension}"
pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)"
r"/([a-z0-9]+)")
example = "https://sta.sh/abcde" example = "https://sta.sh/abcde"
skip = Extractor.skip skip = Extractor.skip
@ -689,7 +706,7 @@ class DeviantartStashExtractor(DeviantartExtractor):
if uuid: if uuid:
deviation = self.api.deviation(uuid) deviation = self.api.deviation(uuid)
deviation["index"] = text.parse_int(text.extr( deviation["index"] = text.parse_int(text.extr(
page, 'gmi-deviationid="', '"')) page, '\\"deviationId\\":', ','))
yield deviation yield deviation
return return
@ -1405,9 +1422,14 @@ class DeviantartOAuthAPI():
self.authenticate(None if public else self.refresh_token_key) self.authenticate(None if public else self.refresh_token_key)
kwargs["headers"] = self.headers kwargs["headers"] = self.headers
response = self.extractor.request(url, **kwargs) response = self.extractor.request(url, **kwargs)
data = response.json()
status = response.status_code
try:
data = response.json()
except ValueError:
self.log.error("Unable to parse API response")
data = {}
status = response.status_code
if 200 <= status < 400: if 200 <= status < 400:
if self.delay > self.delay_min: if self.delay > self.delay_min:
self.delay -= 1 self.delay -= 1
@ -1435,9 +1457,8 @@ class DeviantartOAuthAPI():
self.log.info( self.log.info(
"Register your own OAuth application and use its " "Register your own OAuth application and use its "
"credentials to prevent this error: " "credentials to prevent this error: "
"https://github.com/mikf/gallery-dl/blob/master/do" "https://gdl-org.github.io/docs/configuration.html"
"cs/configuration.rst#extractordeviantartclient-id" "#extractor-deviantart-client-id-client-secret")
"--client-secret")
else: else:
if log: if log:
self.log.error(msg) self.log.error(msg)

View File

@ -50,7 +50,7 @@ class ExhentaiExtractor(Extractor):
def request(self, url, **kwargs): def request(self, url, **kwargs):
response = Extractor.request(self, url, **kwargs) response = Extractor.request(self, url, **kwargs)
if response.history and response.headers.get("Content-Length") == "0": if "Cache-Control" not in response.headers and not response.content:
self.log.info("blank page") self.log.info("blank page")
raise exception.AuthorizationError() raise exception.AuthorizationError()
return response return response
@ -95,7 +95,11 @@ class ExhentaiExtractor(Extractor):
self.cookies.clear() self.cookies.clear()
response = self.request(url, method="POST", headers=headers, data=data) response = self.request(url, method="POST", headers=headers, data=data)
if b"You are now logged in as:" not in response.content: content = response.content
if b"You are now logged in as:" not in content:
if b"The captcha was not entered correctly" in content:
raise exception.AuthenticationError(
"CAPTCHA required. Use cookies instead.")
raise exception.AuthenticationError() raise exception.AuthenticationError()
# collect more cookies # collect more cookies
@ -437,7 +441,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor):
raise exception.AuthorizationError() raise exception.AuthorizationError()
if page.startswith(("Key missing", "Gallery not found")): if page.startswith(("Key missing", "Gallery not found")):
raise exception.NotFoundError("gallery") raise exception.NotFoundError("gallery")
if "hentai.org/mpv/" in page: if page.count("hentai.org/mpv/") > 1:
self.log.warning("Enabled Multi-Page Viewer is not supported") self.log.warning("Enabled Multi-Page Viewer is not supported")
return page return page

View File

@ -42,7 +42,8 @@ class FapelloPostExtractor(Extractor):
"type" : "video" if 'type="video' in page else "photo", "type" : "video" if 'type="video' in page else "photo",
"thumbnail": text.extr(page, 'poster="', '"'), "thumbnail": text.extr(page, 'poster="', '"'),
} }
url = text.extr(page, 'src="', '"') url = text.extr(page, 'src="', '"').replace(
".md", "").replace(".th", "")
yield Message.Directory, data yield Message.Directory, data
yield Message.Url, url, text.nameext_from_url(url, data) yield Message.Url, url, text.nameext_from_url(url, data)

View File

@ -77,6 +77,8 @@ class FlickrImageExtractor(FlickrExtractor):
photo = self.api.photos_getInfo(self.item_id) photo = self.api.photos_getInfo(self.item_id)
if self.api.exif: if self.api.exif:
photo.update(self.api.photos_getExif(self.item_id)) photo.update(self.api.photos_getExif(self.item_id))
if self.api.contexts:
photo.update(self.api.photos_getAllContexts(self.item_id))
if photo["media"] == "video" and self.api.videos: if photo["media"] == "video" and self.api.videos:
self.api._extract_video(photo) self.api._extract_video(photo)
@ -268,6 +270,8 @@ class FlickrAPI(oauth.OAuth1API):
self.exif = extractor.config("exif", False) self.exif = extractor.config("exif", False)
self.videos = extractor.config("videos", True) self.videos = extractor.config("videos", True)
self.contexts = extractor.config("contexts", False)
self.maxsize = extractor.config("size-max") self.maxsize = extractor.config("size-max")
if isinstance(self.maxsize, str): if isinstance(self.maxsize, str):
for fmt, fmtname, fmtwidth in self.FORMATS: for fmt, fmtname, fmtwidth in self.FORMATS:
@ -311,6 +315,13 @@ class FlickrAPI(oauth.OAuth1API):
params = {"user_id": user_id} params = {"user_id": user_id}
return self._pagination("people.getPhotos", params) return self._pagination("people.getPhotos", params)
def photos_getAllContexts(self, photo_id):
"""Returns all visible sets and pools the photo belongs to."""
params = {"photo_id": photo_id}
data = self._call("photos.getAllContexts", params)
del data["stat"]
return data
def photos_getExif(self, photo_id): def photos_getExif(self, photo_id):
"""Retrieves a list of EXIF/TIFF/GPS tags for a given photo.""" """Retrieves a list of EXIF/TIFF/GPS tags for a given photo."""
params = {"photo_id": photo_id} params = {"photo_id": photo_id}
@ -444,6 +455,8 @@ class FlickrAPI(oauth.OAuth1API):
if self.exif: if self.exif:
photo.update(self.photos_getExif(photo["id"])) photo.update(self.photos_getExif(photo["id"]))
if self.contexts:
photo.update(self.photos_getAllContexts(photo["id"]))
photo["id"] = text.parse_int(photo["id"]) photo["id"] = text.parse_int(photo["id"])
if "owner" in photo: if "owner" in photo:

View File

@ -117,8 +117,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
def __init__(self, match): def __init__(self, match):
FoolfuukaExtractor.__init__(self, match) FoolfuukaExtractor.__init__(self, match)
self.board = match.group(match.lastindex-1) self.board = self.groups[-2]
self.thread = match.group(match.lastindex) self.thread = self.groups[-1]
self.data = None self.data = None
def metadata(self): def metadata(self):
@ -140,20 +140,22 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor):
class FoolfuukaBoardExtractor(FoolfuukaExtractor): class FoolfuukaBoardExtractor(FoolfuukaExtractor):
"""Base extractor for FoolFuuka based boards/archives""" """Base extractor for FoolFuuka based boards/archives"""
subcategory = "board" subcategory = "board"
pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$" pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$"
example = "https://archived.moe/a/" example = "https://archived.moe/a/"
def __init__(self, match): def __init__(self, match):
FoolfuukaExtractor.__init__(self, match) FoolfuukaExtractor.__init__(self, match)
self.board = match.group(match.lastindex) self.board = self.groups[-2]
self.page = self.groups[-1]
def items(self): def items(self):
index_base = "{}/_/api/chan/index/?board={}&page=".format( index_base = "{}/_/api/chan/index/?board={}&page=".format(
self.root, self.board) self.root, self.board)
thread_base = "{}/{}/thread/".format(self.root, self.board) thread_base = "{}/{}/thread/".format(self.root, self.board)
for page in itertools.count(1): page = self.page
with self.request(index_base + format(page)) as response: for pnum in itertools.count(text.parse_int(page, 1)):
with self.request(index_base + format(pnum)) as response:
try: try:
threads = response.json() threads = response.json()
except ValueError: except ValueError:
@ -167,6 +169,9 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor):
thread["_extractor"] = FoolfuukaThreadExtractor thread["_extractor"] = FoolfuukaThreadExtractor
yield Message.Queue, thread["url"], thread yield Message.Queue, thread["url"], thread
if page:
return
class FoolfuukaSearchExtractor(FoolfuukaExtractor): class FoolfuukaSearchExtractor(FoolfuukaExtractor):
"""Base extractor for search results on FoolFuuka based boards/archives""" """Base extractor for search results on FoolFuuka based boards/archives"""
@ -179,17 +184,16 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor):
def __init__(self, match): def __init__(self, match):
FoolfuukaExtractor.__init__(self, match) FoolfuukaExtractor.__init__(self, match)
self.params = params = {} self.params = params = {}
args = match.group(match.lastindex).split("/")
key = None
for arg in args: key = None
for arg in self.groups[-1].split("/"):
if key: if key:
params[key] = text.unescape(arg) params[key] = text.unescape(arg)
key = None key = None
else: else:
key = arg key = arg
board = match.group(match.lastindex-1) board = self.groups[-2]
if board != "_": if board != "_":
params["boards"] = board params["boards"] = board

View File

@ -11,7 +11,7 @@
from .common import Extractor, Message from .common import Extractor, Message
from .. import text, util from .. import text, util
BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net" BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net"
class FuraffinityExtractor(Extractor): class FuraffinityExtractor(Extractor):

View File

@ -32,6 +32,9 @@ class GelbooruBase():
url = self.root + "/index.php?page=dapi&q=index&json=1" url = self.root + "/index.php?page=dapi&q=index&json=1"
data = self.request(url, params=params).json() data = self.request(url, params=params).json()
if not key:
return data
try: try:
posts = data[key] posts = data[key]
except KeyError: except KeyError:
@ -48,19 +51,44 @@ class GelbooruBase():
params["pid"] = self.page_start params["pid"] = self.page_start
params["limit"] = self.per_page params["limit"] = self.per_page
limit = self.per_page // 2 limit = self.per_page // 2
pid = False
if "tags" in params:
tags = params["tags"].split()
op = "<"
id = False
for tag in tags:
if tag.startswith("sort:"):
if tag == "sort:id:asc":
op = ">"
elif tag == "sort:id" or tag.startswith("sort:id:"):
op = "<"
else:
pid = True
elif tag.startswith("id:"):
id = True
if not pid:
if id:
tag = "id:" + op
tags = [t for t in tags if not t.startswith(tag)]
tags = "{} id:{}".format(" ".join(tags), op)
while True: while True:
posts = self._api_request(params) posts = self._api_request(params)
for post in posts: yield from posts
yield post
if len(posts) < limit: if len(posts) < limit:
return return
if "pid" in params: if pid:
del params["pid"] params["pid"] += 1
params["tags"] = "{} id:<{}".format(self.tags, post["id"]) else:
if "pid" in params:
del params["pid"]
params["tags"] = tags + str(posts[-1]["id"])
def _pagination_html(self, params): def _pagination_html(self, params):
url = self.root + "/index.php" url = self.root + "/index.php"
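
The rewritten pagination above avoids page numbers where possible and instead appends an id:< (or id:>) cursor to the tag query, continuing from the last post id of each page. A minimal sketch of the descending case, assuming a fetch(tags) callable that returns posts ordered by id; all names are illustrative.

def paginate_by_id(fetch, tags, per_page=100):
    cursor = "{} id:<".format(tags)            # ask for ids below the cursor
    query = tags
    while True:
        posts = fetch(query)
        yield from posts
        if len(posts) < per_page // 2:         # short page: nothing left
            return
        query = cursor + str(posts[-1]["id"])  # continue below the last id

pages = iter([[{"id": i} for i in (9, 8, 7)],
              [{"id": i} for i in (6, 5, 4)],
              [{"id": i} for i in (3, 2, 1)],
              []])
ids = [p["id"] for p in paginate_by_id(lambda q: next(pages), "cat", per_page=3)]
assert ids == [9, 8, 7, 6, 5, 4, 3, 2, 1]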
@ -167,13 +195,61 @@ class GelbooruFavoriteExtractor(GelbooruBase,
params = { params = {
"s" : "favorite", "s" : "favorite",
"id" : self.favorite_id, "id" : self.favorite_id,
"limit": "1", "limit": "2",
} }
data = self._api_request(params, None, True)
count = self._api_request(params, "@attributes", True)[0]["count"] count = data["@attributes"]["count"]
if count <= self.offset: self.log.debug("API reports %s favorite entries", count)
return
favs = data["favorite"]
try:
order = 1 if favs[0]["id"] < favs[1]["id"] else -1
except LookupError as exc:
self.log.debug(
"Error when determining API favorite order (%s: %s)",
exc.__class__.__name__, exc)
order = -1
else:
self.log.debug("API yields favorites in %sscending order",
"a" if order > 0 else "de")
order_favs = self.config("order-posts")
if order_favs and order_favs[0] in ("r", "a"):
self.log.debug("Returning them in reverse")
order = -order
if order < 0:
return self._pagination(params, count)
return self._pagination_reverse(params, count)
def _pagination(self, params, count):
if self.offset:
pnum, skip = divmod(self.offset, self.per_page)
else:
pnum = skip = 0
params["pid"] = pnum
params["limit"] = self.per_page
while True:
favs = self._api_request(params, "favorite")
if not favs:
return
if skip:
favs = favs[skip:]
skip = 0
for fav in favs:
for post in self._api_request({"id": fav["favorite"]}):
post["date_favorited"] = text.parse_timestamp(fav["added"])
yield post
params["pid"] += 1
def _pagination_reverse(self, params, count):
pnum, last = divmod(count-1, self.per_page) pnum, last = divmod(count-1, self.per_page)
if self.offset > last: if self.offset > last:
# page number change # page number change
@ -182,12 +258,11 @@ class GelbooruFavoriteExtractor(GelbooruBase,
pnum -= diff + 1 pnum -= diff + 1
skip = self.offset skip = self.offset
# paginate over them in reverse
params["pid"] = pnum params["pid"] = pnum
params["limit"] = self.per_page params["limit"] = self.per_page
while True: while True:
favs = self._api_request(params, "favorite", True) favs = self._api_request(params, "favorite")
favs.reverse() favs.reverse()
if skip: if skip:
@ -195,7 +270,9 @@ class GelbooruFavoriteExtractor(GelbooruBase,
skip = 0 skip = 0
for fav in favs: for fav in favs:
yield from self._api_request({"id": fav["favorite"]}) for post in self._api_request({"id": fav["favorite"]}):
post["date_favorited"] = text.parse_timestamp(fav["added"])
yield post
params["pid"] -= 1 params["pid"] -= 1
if params["pid"] < 0: if params["pid"] < 0:

View File

@ -41,9 +41,13 @@ class GofileFolderExtractor(Extractor):
folder = self._get_content(self.content_id, password) folder = self._get_content(self.content_id, password)
yield Message.Directory, folder yield Message.Directory, folder
try:
contents = folder.pop("children")
except KeyError:
raise exception.AuthorizationError("Password required")
num = 0 num = 0
contents = folder.pop("contents") for content_id in folder["childrenIds"]:
for content_id in folder["childs"]:
content = contents[content_id] content = contents[content_id]
content["folder"] = folder content["folder"] = folder
@ -67,31 +71,32 @@ class GofileFolderExtractor(Extractor):
@memcache() @memcache()
def _create_account(self): def _create_account(self):
self.log.debug("Creating temporary account") self.log.debug("Creating temporary account")
return self._api_request("createAccount")["token"] return self._api_request("accounts", method="POST")["token"]
@cache(maxage=86400) @cache(maxage=86400)
def _get_website_token(self): def _get_website_token(self):
self.log.debug("Fetching website token") self.log.debug("Fetching website token")
page = self.request(self.root + "/dist/js/alljs.js").text page = self.request(self.root + "/dist/js/alljs.js").text
return text.extr(page, 'fetchData.wt = "', '"') return text.extr(page, 'wt: "', '"')
def _get_content(self, content_id, password=None): def _get_content(self, content_id, password=None):
headers = {"Authorization": "Bearer " + self.api_token}
params = {"wt": self.website_token}
if password is not None: if password is not None:
password = hashlib.sha256(password.encode()).hexdigest() params["password"] = hashlib.sha256(password.encode()).hexdigest()
return self._api_request("getContent", { return self._api_request("contents/" + content_id, params, headers)
"contentId" : content_id,
"token" : self.api_token,
"wt" : self.website_token,
"password" : password,
})
def _api_request(self, endpoint, params=None): def _api_request(self, endpoint, params=None, headers=None, method="GET"):
response = self.request( response = self.request(
"https://api.gofile.io/" + endpoint, params=params).json() "https://api.gofile.io/" + endpoint,
method=method, params=params, headers=headers,
).json()
if response["status"] != "ok": if response["status"] != "ok":
if response["status"] == "error-notFound": if response["status"] == "error-notFound":
raise exception.NotFoundError("content") raise exception.NotFoundError("content")
if response["status"] == "error-passwordRequired":
raise exception.AuthorizationError("Password required")
raise exception.StopExtraction( raise exception.StopExtraction(
"%s failed (Status: %s)", endpoint, response["status"]) "%s failed (Status: %s)", endpoint, response["status"])

View File

@ -25,7 +25,7 @@ class HiperdexBase():
@memcache(keyarg=1) @memcache(keyarg=1)
def manga_data(self, manga, page=None): def manga_data(self, manga, page=None):
if not page: if not page:
url = "{}/manga/{}/".format(self.root, manga) url = "{}/mangas/{}/".format(self.root, manga)
page = self.request(url).text page = self.request(url).text
extr = text.extract_from(page) extr = text.extract_from(page)
@ -33,7 +33,7 @@ class HiperdexBase():
"url" : text.unescape(extr( "url" : text.unescape(extr(
'property="og:url" content="', '"')), 'property="og:url" content="', '"')),
"manga" : text.unescape(extr( "manga" : text.unescape(extr(
'"headline": "', '"')), ' property="name" title="', '"')),
"score" : text.parse_float(extr( "score" : text.parse_float(extr(
'id="averagerate">', '<')), 'id="averagerate">', '<')),
"author" : text.remove_html(extr( "author" : text.remove_html(extr(
@ -68,8 +68,8 @@ class HiperdexBase():
class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
"""Extractor for manga chapters from hiperdex.com""" """Extractor for manga chapters from hiperdex.com"""
pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))" pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))"
example = "https://hiperdex.com/manga/MANGA/CHAPTER/" example = "https://hiperdex.com/mangas/MANGA/CHAPTER/"
def __init__(self, match): def __init__(self, match):
root, path, self.manga, self.chapter = match.groups() root, path, self.manga, self.chapter = match.groups()
@ -90,8 +90,8 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor):
class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor):
"""Extractor for manga from hiperdex.com""" """Extractor for manga from hiperdex.com"""
chapterclass = HiperdexChapterExtractor chapterclass = HiperdexChapterExtractor
pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$" pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$"
example = "https://hiperdex.com/manga/MANGA/" example = "https://hiperdex.com/mangas/MANGA/"
def __init__(self, match): def __init__(self, match):
root, path, self.manga = match.groups() root, path, self.manga = match.groups()

View File

@ -23,6 +23,7 @@ class HotleakExtractor(Extractor):
def items(self): def items(self):
for post in self.posts(): for post in self.posts():
post["_http_expected_status"] = (404,)
yield Message.Directory, post yield Message.Directory, post
yield Message.Url, post["url"], post yield Message.Url, post["url"], post

View File

@ -101,9 +101,8 @@ class IdolcomplexExtractor(SankakuExtractor):
page = self.request(url, retries=10).text page = self.request(url, retries=10).text
extr = text.extract_from(page) extr = text.extract_from(page)
pid_alnum = extr('/posts/', '"') vavg = extr('id="rating"', "</ul>")
vavg = extr('itemprop="ratingValue">', "<") vcnt = extr('>Votes</strong>:', "<")
vcnt = extr('itemprop="reviewCount">', "<")
pid = extr(">Post ID:", "<") pid = extr(">Post ID:", "<")
created = extr(' title="', '"') created = extr(' title="', '"')
@ -120,10 +119,10 @@ class IdolcomplexExtractor(SankakuExtractor):
rating = extr(">Rating:", "<br") rating = extr(">Rating:", "<br")
data = { data = {
"id" : text.parse_int(pid), "id" : pid.strip(),
"id_alnum" : pid_alnum,
"md5" : file_url.rpartition("/")[2].partition(".")[0], "md5" : file_url.rpartition("/")[2].partition(".")[0],
"vote_average": text.parse_float(vavg), "vote_average": (1.0 * vavg.count('class="star-full"') +
0.5 * vavg.count('class="star-half"')),
"vote_count" : text.parse_int(vcnt), "vote_count" : text.parse_int(vcnt),
"created_at" : created, "created_at" : created,
"date" : text.parse_datetime( "date" : text.parse_datetime(
@ -222,8 +221,8 @@ class IdolcomplexPoolExtractor(IdolcomplexExtractor):
subcategory = "pool" subcategory = "pool"
directory_fmt = ("{category}", "pool", "{pool}") directory_fmt = ("{category}", "pool", "{pool}")
archive_fmt = "p_{pool}_{id}" archive_fmt = "p_{pool}_{id}"
pattern = BASE_PATTERN + r"/pools?/show/(\d+)" pattern = BASE_PATTERN + r"/pools?/(?:show/)?(\w+)"
example = "https://idol.sankakucomplex.com/pools/show/12345" example = "https://idol.sankakucomplex.com/pools/0123456789abcdef"
per_page = 24 per_page = 24
def __init__(self, match): def __init__(self, match):

View File

@ -161,11 +161,12 @@ class ImagefapFolderExtractor(ImagefapExtractor):
self.user = user or profile self.user = user or profile
def items(self): def items(self):
for gallery_id, name in self.galleries(self.folder_id): for gallery_id, name, folder in self.galleries(self.folder_id):
url = "{}/gallery/{}".format(self.root, gallery_id) url = "{}/gallery/{}".format(self.root, gallery_id)
data = { data = {
"gallery_id": gallery_id, "gallery_id": gallery_id,
"title" : text.unescape(name), "title" : text.unescape(name),
"folder" : text.unescape(folder),
"_extractor": ImagefapGalleryExtractor, "_extractor": ImagefapGalleryExtractor,
} }
yield Message.Queue, url, data yield Message.Queue, url, data
@ -173,6 +174,7 @@ class ImagefapFolderExtractor(ImagefapExtractor):
def galleries(self, folder_id): def galleries(self, folder_id):
"""Yield gallery IDs and titles of a folder""" """Yield gallery IDs and titles of a folder"""
if folder_id == "-1": if folder_id == "-1":
folder_name = "Uncategorized"
if self._id: if self._id:
url = "{}/usergallery.php?userid={}&folderid=-1".format( url = "{}/usergallery.php?userid={}&folderid=-1".format(
self.root, self.user) self.root, self.user)
@ -180,23 +182,28 @@ class ImagefapFolderExtractor(ImagefapExtractor):
url = "{}/profile/{}/galleries?folderid=-1".format( url = "{}/profile/{}/galleries?folderid=-1".format(
self.root, self.user) self.root, self.user)
else: else:
folder_name = None
url = "{}/organizer/{}/".format(self.root, folder_id) url = "{}/organizer/{}/".format(self.root, folder_id)
params = {"page": 0} params = {"page": 0}
extr = text.extract_from(self.request(url, params=params).text)
if not folder_name:
folder_name = extr("class'blk_galleries'><b>", "</b>")
while True: while True:
extr = text.extract_from(self.request(url, params=params).text)
cnt = 0 cnt = 0
while True: while True:
gid = extr('<a href="/gallery/', '"') gid = extr(' id="gid-', '"')
if not gid: if not gid:
break break
yield gid, extr("<b>", "<") yield gid, extr("<b>", "<"), folder_name
cnt += 1 cnt += 1
if cnt < 20: if cnt < 20:
break break
params["page"] += 1 params["page"] += 1
extr = text.extract_from(self.request(url, params=params).text)
class ImagefapUserExtractor(ImagefapExtractor): class ImagefapUserExtractor(ImagefapExtractor):

View File

@ -39,10 +39,15 @@ class ImgurExtractor(Extractor):
image["url"] = url = "https://i.imgur.com/{}.{}".format( image["url"] = url = "https://i.imgur.com/{}.{}".format(
image["id"], image["ext"]) image["id"], image["ext"])
image["date"] = text.parse_datetime(image["created_at"]) image["date"] = text.parse_datetime(image["created_at"])
image["_http_validate"] = self._validate
text.nameext_from_url(url, image) text.nameext_from_url(url, image)
return url return url
def _validate(self, response):
return (not response.history or
not response.url.endswith("/removed.png"))
def _items_queue(self, items): def _items_queue(self, items):
album_ex = ImgurAlbumExtractor album_ex = ImgurAlbumExtractor
image_ex = ImgurImageExtractor image_ex = ImgurImageExtractor

View File

@ -330,15 +330,18 @@ class InkbunnyAPI():
def _call(self, endpoint, params): def _call(self, endpoint, params):
url = "https://inkbunny.net/api_" + endpoint + ".php" url = "https://inkbunny.net/api_" + endpoint + ".php"
params["sid"] = self.session_id params["sid"] = self.session_id
data = self.extractor.request(url, params=params).json()
if "error_code" in data: while True:
data = self.extractor.request(url, params=params).json()
if "error_code" not in data:
return data
if str(data["error_code"]) == "2": if str(data["error_code"]) == "2":
self.authenticate(invalidate=True) self.authenticate(invalidate=True)
return self._call(endpoint, params) continue
raise exception.StopExtraction(data.get("error_message"))
return data raise exception.StopExtraction(data.get("error_message"))
def _pagination_search(self, params): def _pagination_search(self, params):
params["page"] = 1 params["page"] = 1

View File

@ -165,7 +165,7 @@ class InstagramExtractor(Extractor):
data = { data = {
"post_id" : post["pk"], "post_id" : post["pk"],
"post_shortcode": post["code"], "post_shortcode": post["code"],
"likes": post.get("like_count"), "likes": post.get("like_count", 0),
"pinned": post.get("timeline_pinned_user_ids", ()), "pinned": post.get("timeline_pinned_user_ids", ()),
"date": text.parse_timestamp(post.get("taken_at")), "date": text.parse_timestamp(post.get("taken_at")),
} }
@ -736,7 +736,7 @@ class InstagramRestAPI():
not user["followed_by_viewer"]: not user["followed_by_viewer"]:
name = user["username"] name = user["username"]
s = "" if name.endswith("s") else "s" s = "" if name.endswith("s") else "s"
raise exception.StopExtraction("%s'%s posts are private", name, s) self.extractor.log.warning("%s'%s posts are private", name, s)
self.extractor._assign_user(user) self.extractor._assign_user(user)
return user["id"] return user["id"]

View File

@ -41,6 +41,9 @@ class KemonopartyExtractor(Extractor):
self.revisions = self.config("revisions") self.revisions = self.config("revisions")
if self.revisions: if self.revisions:
self.revisions_unique = (self.revisions == "unique") self.revisions_unique = (self.revisions == "unique")
order = self.config("order-revisions")
self.revisions_reverse = order[0] in ("r", "a") if order else False
self._prepare_ddosguard_cookies() self._prepare_ddosguard_cookies()
self._find_inline = re.compile( self._find_inline = re.compile(
r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+' r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+'
@ -54,7 +57,7 @@ class KemonopartyExtractor(Extractor):
generators = self._build_file_generators(self.config("files")) generators = self._build_file_generators(self.config("files"))
duplicates = self.config("duplicates") duplicates = self.config("duplicates")
comments = self.config("comments") comments = self.config("comments")
username = dms = None username = dms = announcements = None
# prevent files from being sent with gzip compression # prevent files from being sent with gzip compression
headers = {"Accept-Encoding": "identity"} headers = {"Accept-Encoding": "identity"}
@ -65,6 +68,8 @@ class KemonopartyExtractor(Extractor):
'<meta name="artist_name" content="', '"')[0]) '<meta name="artist_name" content="', '"')[0])
if self.config("dms"): if self.config("dms"):
dms = True dms = True
if self.config("announcements"):
announcements = True
posts = self.posts() posts = self.posts()
max_posts = self.config("max-posts") max_posts = self.config("max-posts")
@ -77,7 +82,7 @@ class KemonopartyExtractor(Extractor):
self.root, post["service"], post["user"], post["id"]) self.root, post["service"], post["user"], post["id"])
post["_http_headers"] = headers post["_http_headers"] = headers
post["date"] = self._parse_datetime( post["date"] = self._parse_datetime(
post["published"] or post["added"]) post.get("published") or post.get("added") or "")
if username: if username:
post["username"] = username post["username"] = username
@ -85,8 +90,12 @@ class KemonopartyExtractor(Extractor):
post["comments"] = self._extract_comments(post) post["comments"] = self._extract_comments(post)
if dms is not None: if dms is not None:
if dms is True: if dms is True:
dms = self._extract_dms(post) dms = self._extract_cards(post, "dms")
post["dms"] = dms post["dms"] = dms
if announcements is not None:
if announcements is True:
announcements = self._extract_cards(post, "announcements")
post["announcements"] = announcements
files = [] files = []
hashes = set() hashes = set()
@ -153,7 +162,7 @@ class KemonopartyExtractor(Extractor):
def _file(self, post): def _file(self, post):
file = post["file"] file = post["file"]
if not file: if not file or "path" not in file:
return () return ()
file["type"] = "file" file["type"] = "file"
return (file,) return (file,)
@ -197,21 +206,21 @@ class KemonopartyExtractor(Extractor):
}) })
return comments return comments
def _extract_dms(self, post): def _extract_cards(self, post, type):
url = "{}/{}/user/{}/dms".format( url = "{}/{}/user/{}/{}".format(
self.root, post["service"], post["user"]) self.root, post["service"], post["user"], type)
page = self.request(url).text page = self.request(url).text
dms = [] cards = []
for dm in text.extract_iter(page, "<article", "</article>"): for card in text.extract_iter(page, "<article", "</article>"):
footer = text.extr(dm, "<footer", "</footer>") footer = text.extr(card, "<footer", "</footer>")
dms.append({ cards.append({
"body": text.unescape(text.extr( "body": text.unescape(text.extr(
dm, "<pre>", "</pre></", card, "<pre>", "</pre></",
).strip()), ).strip()),
"date": text.extr(footer, 'Published: ', '\n'), "date": text.extr(footer, ': ', '\n'),
}) })
return dms return cards
def _parse_datetime(self, date_string): def _parse_datetime(self, date_string):
if len(date_string) > 19: if len(date_string) > 19:
@ -232,6 +241,7 @@ class KemonopartyExtractor(Extractor):
except exception.HttpError: except exception.HttpError:
post["revision_hash"] = self._revision_hash(post) post["revision_hash"] = self._revision_hash(post)
post["revision_index"] = 1 post["revision_index"] = 1
post["revision_count"] = 1
return (post,) return (post,)
revs.insert(0, post) revs.insert(0, post)
@ -247,22 +257,30 @@ class KemonopartyExtractor(Extractor):
uniq.append(rev) uniq.append(rev)
revs = uniq revs = uniq
idx = len(revs) cnt = idx = len(revs)
for rev in revs: for rev in revs:
rev["revision_index"] = idx rev["revision_index"] = idx
rev["revision_count"] = cnt
idx -= 1 idx -= 1
if self.revisions_reverse:
revs.reverse()
return revs return revs
def _revisions_all(self, url): def _revisions_all(self, url):
revs = self.request(url + "/revisions").json() revs = self.request(url + "/revisions").json()
idx = len(revs) cnt = idx = len(revs)
for rev in revs: for rev in revs:
rev["revision_hash"] = self._revision_hash(rev) rev["revision_hash"] = self._revision_hash(rev)
rev["revision_index"] = idx rev["revision_index"] = idx
rev["revision_count"] = cnt
idx -= 1 idx -= 1
if self.revisions_reverse:
revs.reverse()
return revs return revs
def _revision_hash(self, revision): def _revision_hash(self, revision):
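
Revision numbering above works the same way in both branches: revisions arrive newest-first, are indexed from oldest (1) to newest (revision_count), and are optionally reversed for output when "order-revisions" starts with "r" or "a". A small sketch; number_revisions is an illustrative name.

def number_revisions(revs, reverse=False):
    cnt = idx = len(revs)
    for rev in revs:                    # revs are ordered newest-first
        rev["revision_index"] = idx
        rev["revision_count"] = cnt
        idx -= 1
    if reverse:
        revs.reverse()                  # oldest-first output if requested
    return revs

revs = [{"id": "newest"}, {"id": "middle"}, {"id": "oldest"}]
number_revisions(revs)
assert [r["revision_index"] for r in revs] == [3, 2, 1]
assert all(r["revision_count"] == 3 for r in revs)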
@ -482,7 +500,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
def __init__(self, match): def __init__(self, match):
KemonopartyExtractor.__init__(self, match) KemonopartyExtractor.__init__(self, match)
self.favorites = (text.parse_query(match.group(3)).get("type") or self.params = text.parse_query(match.group(3))
self.favorites = (self.params.get("type") or
self.config("favorites") or self.config("favorites") or
"artist") "artist")
@ -490,9 +509,17 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
self._prepare_ddosguard_cookies() self._prepare_ddosguard_cookies()
self.login() self.login()
sort = self.params.get("sort")
order = self.params.get("order") or "desc"
if self.favorites == "artist": if self.favorites == "artist":
users = self.request( users = self.request(
self.root + "/api/v1/account/favorites?type=artist").json() self.root + "/api/v1/account/favorites?type=artist").json()
if not sort:
sort = "updated"
users.sort(key=lambda x: x[sort], reverse=(order == "desc"))
for user in users: for user in users:
user["_extractor"] = KemonopartyUserExtractor user["_extractor"] = KemonopartyUserExtractor
url = "{}/{}/user/{}".format( url = "{}/{}/user/{}".format(
@ -502,6 +529,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor):
elif self.favorites == "post": elif self.favorites == "post":
posts = self.request( posts = self.request(
self.root + "/api/v1/account/favorites?type=post").json() self.root + "/api/v1/account/favorites?type=post").json()
if not sort:
sort = "faved_seq"
posts.sort(key=lambda x: x[sort], reverse=(order == "desc"))
for post in posts: for post in posts:
post["_extractor"] = KemonopartyPostExtractor post["_extractor"] = KemonopartyPostExtractor
url = "{}/{}/user/{}/post/{}".format( url = "{}/{}/user/{}/post/{}".format(

View File

@ -104,7 +104,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor):
filename_fmt = "{category}_{id}{title:?_//}.{extension}" filename_fmt = "{category}_{id}{title:?_//}.{extension}"
directory_fmt = ("{category}",) directory_fmt = ("{category}",)
archive_fmt = "{id}" archive_fmt = "{id}"
pattern = BASE_PATTERN + r"/i/(\w+)" pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)"
example = "https://lensdump.com/i/ID" example = "https://lensdump.com/i/ID"
def __init__(self, match): def __init__(self, match):

View File

@ -29,6 +29,7 @@ class MastodonExtractor(BaseExtractor):
self.instance = self.root.partition("://")[2] self.instance = self.root.partition("://")[2]
self.reblogs = self.config("reblogs", False) self.reblogs = self.config("reblogs", False)
self.replies = self.config("replies", True) self.replies = self.config("replies", True)
self.cards = self.config("cards", False)
def items(self): def items(self):
for status in self.statuses(): for status in self.statuses():
@ -48,6 +49,17 @@ class MastodonExtractor(BaseExtractor):
if status["reblog"]: if status["reblog"]:
attachments.extend(status["reblog"]["media_attachments"]) attachments.extend(status["reblog"]["media_attachments"])
if self.cards:
card = status.get("card")
if card:
url = card.get("image")
if url:
card["weburl"] = card.get("url")
card["url"] = url
card["id"] = "card" + "".join(
url.split("/")[6:-2]).lstrip("0")
attachments.append(card)
status["instance"] = self.instance status["instance"] = self.instance
acct = status["account"]["acct"] acct = status["account"]["acct"]
status["instance_remote"] = \ status["instance_remote"] = \
@ -70,7 +82,11 @@ class MastodonExtractor(BaseExtractor):
def _check_moved(self, account): def _check_moved(self, account):
self._check_moved = None self._check_moved = None
if "moved" in account: # Certain fediverse software (such as Iceshrimp and Sharkey) have a
# null account "moved" field instead of not having it outright.
# To handle this, check if the "moved" value is truthy instead
# if only it exists.
if account.get("moved"):
self.log.warning("Account '%s' moved to '%s'", self.log.warning("Account '%s' moved to '%s'",
account["acct"], account["moved"]["acct"]) account["acct"], account["moved"]["acct"])
@ -116,6 +132,7 @@ class MastodonUserExtractor(MastodonExtractor):
api.account_id_by_username(self.item), api.account_id_by_username(self.item),
only_media=( only_media=(
not self.reblogs and not self.reblogs and
not self.cards and
not self.config("text-posts", False) not self.config("text-posts", False)
), ),
exclude_replies=not self.replies, exclude_replies=not self.replies,
@ -132,6 +149,36 @@ class MastodonBookmarkExtractor(MastodonExtractor):
return MastodonAPI(self).account_bookmarks() return MastodonAPI(self).account_bookmarks()
class MastodonFavoriteExtractor(MastodonExtractor):
"""Extractor for mastodon favorites"""
subcategory = "favorite"
pattern = BASE_PATTERN + r"/favourites"
example = "https://mastodon.social/favourites"
def statuses(self):
return MastodonAPI(self).account_favorites()
class MastodonListExtractor(MastodonExtractor):
"""Extractor for mastodon lists"""
subcategory = "list"
pattern = BASE_PATTERN + r"/lists/(\w+)"
example = "https://mastodon.social/lists/12345"
def statuses(self):
return MastodonAPI(self).timelines_list(self.item)
class MastodonHashtagExtractor(MastodonExtractor):
"""Extractor for mastodon hashtags"""
subcategory = "hashtag"
pattern = BASE_PATTERN + r"/tags/(\w+)"
example = "https://mastodon.social/tags/NAME"
def statuses(self):
return MastodonAPI(self).timelines_tag(self.item)
class MastodonFollowingExtractor(MastodonExtractor): class MastodonFollowingExtractor(MastodonExtractor):
"""Extractor for followed mastodon users""" """Extractor for followed mastodon users"""
subcategory = "following" subcategory = "following"
@ -201,37 +248,55 @@ class MastodonAPI():
raise exception.NotFoundError("account") raise exception.NotFoundError("account")
def account_bookmarks(self): def account_bookmarks(self):
"""Statuses the user has bookmarked"""
endpoint = "/v1/bookmarks" endpoint = "/v1/bookmarks"
return self._pagination(endpoint, None) return self._pagination(endpoint, None)
def account_favorites(self):
"""Statuses the user has favourited"""
endpoint = "/v1/favourites"
return self._pagination(endpoint, None)
def account_following(self, account_id): def account_following(self, account_id):
"""Accounts which the given account is following"""
endpoint = "/v1/accounts/{}/following".format(account_id) endpoint = "/v1/accounts/{}/following".format(account_id)
return self._pagination(endpoint, None) return self._pagination(endpoint, None)
def account_lookup(self, username): def account_lookup(self, username):
"""Quickly lookup a username to see if it is available"""
endpoint = "/v1/accounts/lookup" endpoint = "/v1/accounts/lookup"
params = {"acct": username} params = {"acct": username}
return self._call(endpoint, params).json() return self._call(endpoint, params).json()
def account_search(self, query, limit=40): def account_search(self, query, limit=40):
"""Search for accounts""" """Search for matching accounts by username or display name"""
endpoint = "/v1/accounts/search" endpoint = "/v1/accounts/search"
params = {"q": query, "limit": limit} params = {"q": query, "limit": limit}
return self._call(endpoint, params).json() return self._call(endpoint, params).json()
def account_statuses(self, account_id, only_media=True, def account_statuses(self, account_id, only_media=True,
exclude_replies=False): exclude_replies=False):
"""Fetch an account's statuses""" """Statuses posted to the given account"""
endpoint = "/v1/accounts/{}/statuses".format(account_id) endpoint = "/v1/accounts/{}/statuses".format(account_id)
params = {"only_media" : "1" if only_media else "0", params = {"only_media" : "true" if only_media else "false",
"exclude_replies": "1" if exclude_replies else "0"} "exclude_replies": "true" if exclude_replies else "false"}
return self._pagination(endpoint, params) return self._pagination(endpoint, params)
def status(self, status_id): def status(self, status_id):
"""Fetch a status""" """Obtain information about a status"""
endpoint = "/v1/statuses/" + status_id endpoint = "/v1/statuses/" + status_id
return self._call(endpoint).json() return self._call(endpoint).json()
def timelines_list(self, list_id):
"""View statuses in the given list timeline"""
endpoint = "/v1/timelines/list/" + list_id
return self._pagination(endpoint, None)
def timelines_tag(self, hashtag):
"""View public statuses containing the given hashtag"""
endpoint = "/v1/timelines/tag/" + hashtag
return self._pagination(endpoint, None)
def _call(self, endpoint, params=None): def _call(self, endpoint, params=None):
if endpoint.startswith("http"): if endpoint.startswith("http"):
url = endpoint url = endpoint
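
The three new extractors above map to standard Mastodon API endpoints. A rough sketch of the equivalent raw calls, assuming an access token with read scope; the instance, token, and ids are placeholders.

import requests

INSTANCE = "https://mastodon.social"
HEADERS = {"Authorization": "Bearer YOUR_ACCESS_TOKEN"}

def api_get(endpoint, params=None):
    return requests.get(INSTANCE + "/api" + endpoint,
                        params=params, headers=HEADERS).json()

favorites = api_get("/v1/favourites")             # statuses you favourited
list_posts = api_get("/v1/timelines/list/12345")  # statuses from a list
tag_posts = api_get("/v1/timelines/tag/NAME")     # public statuses for a tag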

View File

@ -26,7 +26,8 @@ class NaverPostExtractor(NaverBase, GalleryExtractor):
"{post[date]:%Y-%m-%d} {post[title]}") "{post[date]:%Y-%m-%d} {post[title]}")
archive_fmt = "{blog[id]}_{post[num]}_{num}" archive_fmt = "{blog[id]}_{post[num]}_{num}"
pattern = (r"(?:https?://)?blog\.naver\.com/" pattern = (r"(?:https?://)?blog\.naver\.com/"
r"(?:PostView\.nhn\?blogId=(\w+)&logNo=(\d+)|(\w+)/(\d+)/?$)") r"(?:PostView\.n(?:aver|hn)\?blogId=(\w+)&logNo=(\d+)|"
r"(\w+)/(\d+)/?$)")
example = "https://blog.naver.com/BLOGID/12345" example = "https://blog.naver.com/BLOGID/12345"
def __init__(self, match): def __init__(self, match):
@ -46,8 +47,10 @@ class NaverPostExtractor(NaverBase, GalleryExtractor):
extr = text.extract_from(page) extr = text.extract_from(page)
data = { data = {
"post": { "post": {
"title" : extr('"og:title" content="', '"'), "title" : text.unescape(extr(
"description": extr('"og:description" content="', '"'), '"og:title" content="', '"')),
"description": text.unescape(extr(
'"og:description" content="', '"')).replace("&nbsp;", " "),
"num" : text.parse_int(self.post_id), "num" : text.parse_int(self.post_id),
}, },
"blog": { "blog": {
@ -62,10 +65,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor):
return data return data
def images(self, page): def images(self, page):
return [ results = []
(url.replace("://post", "://blog", 1).partition("?")[0], None) for url in text.extract_iter(page, 'data-lazy-src="', '"'):
for url in text.extract_iter(page, 'data-lazy-src="', '"') url = url.replace("://post", "://blog", 1).partition("?")[0]
] if "\ufffd" in text.unquote(url):
url = text.unquote(url, encoding="EUC-KR")
results.append((url, None))
return results
class NaverBlogExtractor(NaverBase, Extractor): class NaverBlogExtractor(NaverBase, Extractor):
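
The image-URL handling above now repairs percent-encoded EUC-KR filenames: if decoding the escapes as UTF-8 yields U+FFFD replacement characters, the URL is decoded again as EUC-KR. A standalone sketch of the same check; fix_korean_url is an illustrative name.

from urllib.parse import quote, unquote

def fix_korean_url(url):
    if "\ufffd" in unquote(url):               # UTF-8 decoding failed
        return unquote(url, encoding="EUC-KR")
    return url

word = "다"                                     # a Hangul syllable
escaped = quote(word, encoding="EUC-KR")        # e.g. '%B4%D9'
assert "\ufffd" in unquote(escaped)             # mojibake when read as UTF-8
assert fix_korean_url(escaped) == word          # recovered as EUC-KR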
@ -73,7 +79,8 @@ class NaverBlogExtractor(NaverBase, Extractor):
subcategory = "blog" subcategory = "blog"
categorytransfer = True categorytransfer = True
pattern = (r"(?:https?://)?blog\.naver\.com/" pattern = (r"(?:https?://)?blog\.naver\.com/"
r"(?:PostList.nhn\?(?:[^&#]+&)*blogId=([^&#]+)|(\w+)/?$)") r"(?:PostList\.n(?:aver|hn)\?(?:[^&#]+&)*blogId=([^&#]+)|"
r"(\w+)/?$)")
example = "https://blog.naver.com/BLOGID" example = "https://blog.naver.com/BLOGID"
def __init__(self, match): def __init__(self, match):
@ -81,12 +88,11 @@ class NaverBlogExtractor(NaverBase, Extractor):
self.blog_id = match.group(1) or match.group(2) self.blog_id = match.group(1) or match.group(2)
def items(self): def items(self):
# fetch first post number # fetch first post number
url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id) url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id)
post_num = text.extract( post_num = text.extr(
self.request(url).text, 'gnFirstLogNo = "', '"', self.request(url).text, 'gnFirstLogNo = "', '"',
)[0] )
# setup params for API calls # setup params for API calls
url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root) url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root)

View File

@ -110,7 +110,7 @@ class OAuthBase(Extractor):
# get a request token # get a request token
params = {"oauth_callback": self.redirect_uri} params = {"oauth_callback": self.redirect_uri}
data = self.session.get(request_token_url, params=params).text data = self.request(request_token_url, params=params).text
data = text.parse_query(data) data = text.parse_query(data)
self.session.auth.token_secret = data["oauth_token_secret"] self.session.auth.token_secret = data["oauth_token_secret"]
@ -120,7 +120,7 @@ class OAuthBase(Extractor):
data = self.open(authorize_url, params) data = self.open(authorize_url, params)
# exchange the request token for an access token # exchange the request token for an access token
data = self.session.get(access_token_url, params=data).text data = self.request(access_token_url, params=data).text
data = text.parse_query(data) data = text.parse_query(data)
token = data["oauth_token"] token = data["oauth_token"]
token_secret = data["oauth_token_secret"] token_secret = data["oauth_token_secret"]
@ -189,7 +189,8 @@ class OAuthBase(Extractor):
data["client_id"] = client_id data["client_id"] = client_id
data["client_secret"] = client_secret data["client_secret"] = client_secret
data = self.session.post(token_url, data=data, auth=auth).json() data = self.request(
token_url, method="POST", data=data, auth=auth).json()
# check token response # check token response
if "error" in data: if "error" in data:
@ -386,7 +387,7 @@ class OAuthMastodon(OAuthBase):
"redirect_uris": self.redirect_uri, "redirect_uris": self.redirect_uri,
"scopes": "read", "scopes": "read",
} }
data = self.session.post(url, data=data).json() data = self.request(url, method="POST", data=data).json()
if "client_id" not in data or "client_secret" not in data: if "client_id" not in data or "client_secret" not in data:
raise exception.StopExtraction( raise exception.StopExtraction(
@ -441,7 +442,8 @@ class OAuthPixiv(OAuthBase):
"redirect_uri" : "https://app-api.pixiv.net" "redirect_uri" : "https://app-api.pixiv.net"
"/web/v1/users/auth/pixiv/callback", "/web/v1/users/auth/pixiv/callback",
} }
data = self.session.post(url, headers=headers, data=data).json() data = self.request(
url, method="POST", headers=headers, data=data).json()
if "error" in data: if "error" in data:
stdout_write("\n{}\n".format(data)) stdout_write("\n{}\n".format(data))

View File

@ -104,8 +104,9 @@ class PixivExtractor(Extractor):
elif work["page_count"] == 1: elif work["page_count"] == 1:
url = meta_single_page["original_image_url"] url = meta_single_page["original_image_url"]
if url == url_sanity: if url == url_sanity:
self.log.debug("Skipping 'sanity_level' warning (%s)", self.log.warning(
work["id"]) "Unable to download work %s ('sanity_level' warning)",
work["id"])
continue continue
work["date_url"] = self._date_from_url(url) work["date_url"] = self._date_from_url(url)
yield Message.Url, url, text.nameext_from_url(url, work) yield Message.Url, url, text.nameext_from_url(url, work)
@ -619,6 +620,7 @@ class PixivNovelExtractor(PixivExtractor):
meta_user = self.config("metadata") meta_user = self.config("metadata")
meta_bookmark = self.config("metadata-bookmark") meta_bookmark = self.config("metadata-bookmark")
embeds = self.config("embeds") embeds = self.config("embeds")
covers = self.config("covers")
if embeds: if embeds:
headers = { headers = {
@ -650,7 +652,7 @@ class PixivNovelExtractor(PixivExtractor):
yield Message.Directory, novel yield Message.Directory, novel
try: try:
content = self.api.novel_text(novel["id"])["novel_text"] content = self.api.novel_webview(novel["id"])["text"]
except Exception: except Exception:
self.log.warning("Unable to download novel %s", novel["id"]) self.log.warning("Unable to download novel %s", novel["id"])
continue continue
@ -658,12 +660,25 @@ class PixivNovelExtractor(PixivExtractor):
novel["extension"] = "txt" novel["extension"] = "txt"
yield Message.Url, "text:" + content, novel yield Message.Url, "text:" + content, novel
if covers:
path = novel["image_urls"]["large"].partition("/img/")[2]
url = ("https://i.pximg.net/novel-cover-original/img/" +
path.rpartition(".")[0].replace("_master1200", ""))
novel["date_url"] = self._date_from_url(url)
novel["num"] += 1
novel["suffix"] = "_p{:02}".format(novel["num"])
novel["_fallback"] = (url + ".png",)
url_jpg = url + ".jpg"
text.nameext_from_url(url_jpg, novel)
yield Message.Url, url_jpg, novel
del novel["_fallback"]
if embeds: if embeds:
desktop = False desktop = False
illusts = {} illusts = {}
for marker in text.extract_iter(content, "[", "]"): for marker in text.extract_iter(content, "[", "]"):
if marker.startswith("[jumpuri:If you would like to "): if marker.startswith("uploadedimage:"):
desktop = True desktop = True
elif marker.startswith("pixivimage:"): elif marker.startswith("pixivimage:"):
illusts[marker[11:].partition("-")[0]] = None illusts[marker[11:].partition("-")[0]] = None
@ -918,6 +933,15 @@ class PixivAppAPI():
params = {"novel_id": novel_id} params = {"novel_id": novel_id}
return self._call("/v1/novel/text", params) return self._call("/v1/novel/text", params)
def novel_webview(self, novel_id):
params = {"id": novel_id, "viewer_version": "20221031_ai"}
return self._call(
"/webview/v2/novel", params, self._novel_webview_parse)
def _novel_webview_parse(self, response):
return util.json_loads(text.extr(
response.text, "novel: ", ",\n"))
def search_illust(self, word, sort=None, target=None, duration=None, def search_illust(self, word, sort=None, target=None, duration=None,
date_start=None, date_end=None): date_start=None, date_end=None):
params = {"word": word, "search_target": target, params = {"word": word, "search_target": target,
@ -962,13 +986,17 @@ class PixivAppAPI():
params = {"illust_id": illust_id} params = {"illust_id": illust_id}
return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"] return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"]
def _call(self, endpoint, params=None): def _call(self, endpoint, params=None, parse=None):
url = "https://app-api.pixiv.net" + endpoint url = "https://app-api.pixiv.net" + endpoint
while True: while True:
self.login() self.login()
response = self.extractor.request(url, params=params, fatal=False) response = self.extractor.request(url, params=params, fatal=False)
data = response.json()
if parse:
data = parse(response)
else:
data = response.json()
if "error" not in data: if "error" not in data:
return data return data
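Note: the cover download and the new novel_webview endpoint both rely on string surgery rather than a documented API. A minimal standalone sketch of the two steps (the URL layout and the "novel: ...,\n" marker are taken from the diff above; json stands in for util.json_loads):

import json

def cover_url_from_thumbnail(large_url):
    # keep everything after "/img/", drop the extension and the "_master1200" suffix
    path = large_url.partition("/img/")[2]
    stem = path.rpartition(".")[0].replace("_master1200", "")
    base = "https://i.pximg.net/novel-cover-original/img/" + stem
    return base + ".jpg", (base + ".png",)   # primary URL plus a fallback candidate

def novel_from_webview_html(html_text):
    # the webview response embeds its payload as "novel: {...},\n" inside a script block
    blob = html_text.partition("novel: ")[2].partition(",\n")[0]
    return json.loads(blob)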

View File

@ -23,6 +23,10 @@ class PoipikuExtractor(Extractor):
archive_fmt = "{post_id}_{num}" archive_fmt = "{post_id}_{num}"
request_interval = (0.5, 1.5) request_interval = (0.5, 1.5)
def _init(self):
self.cookies.set(
"POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com")
def items(self): def items(self):
password = self.config("password", "") password = self.config("password", "")
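Note: setting POIPIKU_CONTENTS_VIEW_MODE=1 up front presumably switches the site into its expanded "contents view" mode before the first page is fetched. Outside the extractor, the same trick would look roughly like this (plain requests as a stand-in for the extractor's session):

import requests

session = requests.Session()
# ask poipiku.com for the expanded "contents view" before the first request
session.cookies.set("POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com")
page = session.get("https://poipiku.com/").text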

View File

@ -143,6 +143,9 @@ class PornhubGifExtractor(PornhubExtractor):
"url" : extr('"contentUrl": "', '"'), "url" : extr('"contentUrl": "', '"'),
"date" : text.parse_datetime( "date" : text.parse_datetime(
extr('"uploadDate": "', '"'), "%Y-%m-%d"), extr('"uploadDate": "', '"'), "%Y-%m-%d"),
"viewkey" : extr('From this video: '
'<a href="/view_video.php?viewkey=', '"'),
"timestamp": extr('lass="directLink tstamp" rel="nofollow">', '<'),
"user" : text.remove_html(extr("Created by:", "</div>")), "user" : text.remove_html(extr("Created by:", "</div>")),
} }
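Note: the two added fields use the same forward-only cursor extraction as the rest of the block, so their order in the source HTML matters. A simplified standalone version of that pattern (not the real text.extract_from, just the idea):

def make_extr(page):
    pos = 0
    def extr(begin, end):
        nonlocal pos
        try:
            first = page.index(begin, pos) + len(begin)
            last = page.index(end, first)
            pos = last + len(end)
            return page[first:last]
        except ValueError:
            return ""
    return extr

extr = make_extr('From this video: <a href="/view_video.php?viewkey=abc123">')
print(extr('From this video: <a href="/view_video.php?viewkey=', '"'))   # abc123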

View File

@ -35,10 +35,7 @@ class ReadcomiconlineBase():
self.log.warning( self.log.warning(
"Redirect to \n%s\nVisit this URL in your browser, solve " "Redirect to \n%s\nVisit this URL in your browser, solve "
"the CAPTCHA, and press ENTER to continue", response.url) "the CAPTCHA, and press ENTER to continue", response.url)
try: self.input()
input()
except (EOFError, OSError):
pass
else: else:
raise exception.StopExtraction( raise exception.StopExtraction(
"Redirect to \n%s\nVisit this URL in your browser and " "Redirect to \n%s\nVisit this URL in your browser and "

View File

@ -74,8 +74,8 @@ class RedditExtractor(Extractor):
yield Message.Url, url, submission yield Message.Url, url, submission
elif "gallery_data" in media: elif "gallery_data" in media:
for submission["num"], url in enumerate( for url in self._extract_gallery(media):
self._extract_gallery(media), 1): submission["num"] += 1
text.nameext_from_url(url, submission) text.nameext_from_url(url, submission)
yield Message.Url, url, submission yield Message.Url, url, submission
@ -99,7 +99,10 @@ class RedditExtractor(Extractor):
urls.append((url, submission)) urls.append((url, submission))
for comment in comments: for comment in comments:
html = comment["body_html"] or "" html = comment["body_html"] or ""
if ' href="' in html: href = (' href="' in html)
media = ("media_metadata" in comment)
if media or href:
comment["date"] = text.parse_timestamp( comment["date"] = text.parse_timestamp(
comment["created_utc"]) comment["created_utc"])
if submission: if submission:
@ -107,6 +110,14 @@ class RedditExtractor(Extractor):
data["comment"] = comment data["comment"] = comment
else: else:
data = comment data = comment
if media:
for embed in self._extract_embed(comment):
submission["num"] += 1
text.nameext_from_url(embed, submission)
yield Message.Url, embed, submission
if href:
for url in text.extract_iter(html, ' href="', '"'): for url in text.extract_iter(html, ' href="', '"'):
urls.append((url, data)) urls.append((url, data))
@ -118,6 +129,7 @@ class RedditExtractor(Extractor):
if url.startswith(( if url.startswith((
"https://www.reddit.com/message/compose", "https://www.reddit.com/message/compose",
"https://reddit.com/message/compose", "https://reddit.com/message/compose",
"https://preview.redd.it/",
)): )):
continue continue
@ -172,6 +184,27 @@ class RedditExtractor(Extractor):
submission["id"], item["media_id"]) submission["id"], item["media_id"])
self.log.debug(src) self.log.debug(src)
def _extract_embed(self, submission):
meta = submission["media_metadata"]
if not meta:
return
for mid, data in meta.items():
if data["status"] != "valid" or "s" not in data:
self.log.warning(
"embed %s: skipping item %s (status: %s)",
submission["id"], mid, data.get("status"))
continue
src = data["s"]
url = src.get("u") or src.get("gif") or src.get("mp4")
if url:
yield url.partition("?")[0].replace("/preview.", "/i.", 1)
else:
self.log.error(
"embed %s: unable to fetch download URL for item %s",
submission["id"], mid)
self.log.debug(src)
def _extract_video_ytdl(self, submission): def _extract_video_ytdl(self, submission):
return "https://www.reddit.com" + submission["permalink"] return "https://www.reddit.com" + submission["permalink"]
@ -191,6 +224,8 @@ class RedditExtractor(Extractor):
try: try:
if "reddit_video_preview" in post["preview"]: if "reddit_video_preview" in post["preview"]:
video = post["preview"]["reddit_video_preview"] video = post["preview"]["reddit_video_preview"]
if "fallback_url" in video:
yield video["fallback_url"]
if "dash_url" in video: if "dash_url" in video:
yield "ytdl:" + video["dash_url"] yield "ytdl:" + video["dash_url"]
if "hls_url" in video: if "hls_url" in video:
@ -200,6 +235,12 @@ class RedditExtractor(Extractor):
try: try:
for image in post["preview"]["images"]: for image in post["preview"]["images"]:
variants = image.get("variants")
if variants:
if "gif" in variants:
yield variants["gif"]["source"]["url"]
if "mp4" in variants:
yield variants["mp4"]["source"]["url"]
yield image["source"]["url"] yield image["source"]["url"]
except Exception as exc: except Exception as exc:
self.log.debug("%s: %s", exc.__class__.__name__, exc) self.log.debug("%s: %s", exc.__class__.__name__, exc)
@ -446,14 +487,14 @@ class RedditAPI():
remaining = response.headers.get("x-ratelimit-remaining") remaining = response.headers.get("x-ratelimit-remaining")
if remaining and float(remaining) < 2: if remaining and float(remaining) < 2:
if self._warn_429: self.log.warning("API rate limit exceeded")
self._warn_429 = False if self._warn_429 and self.client_id == self.CLIENT_ID:
self.log.info( self.log.info(
"Register your own OAuth application and use its " "Register your own OAuth application and use its "
"credentials to prevent this error: " "credentials to prevent this error: "
"https://github.com/mikf/gallery-dl/blob/master" "https://gdl-org.github.io/docs/configuration.html"
"/docs/configuration.rst" "#extractor-reddit-client-id-user-agent")
"#extractorredditclient-id--user-agent") self._warn_429 = False
self.extractor.wait( self.extractor.wait(
seconds=response.headers["x-ratelimit-reset"]) seconds=response.headers["x-ratelimit-reset"])
continue continue
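Note: _extract_embed walks a comment's media_metadata dict, keeps only items with status "valid", and rewrites preview URLs to the direct i.redd.it host. Reduced to the essentials (field names from the diff, sample data invented):

def iter_embed_urls(media_metadata):
    for mid, item in (media_metadata or {}).items():
        if item.get("status") != "valid" or "s" not in item:
            continue                                   # skip failed or still-processing items
        src = item["s"]
        url = src.get("u") or src.get("gif") or src.get("mp4")
        if url:
            # drop the query string and switch the preview host to the direct one
            yield url.partition("?")[0].replace("/preview.", "/i.", 1)

sample = {"x1": {"status": "valid", "s": {"u": "https://preview.redd.it/x1.jpg?width=640"}}}
print(list(iter_embed_urls(sample)))                   # ['https://i.redd.it/x1.jpg']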

View File

@ -26,10 +26,10 @@ class SkebExtractor(Extractor):
def _init(self): def _init(self):
self.thumbnails = self.config("thumbnails", False) self.thumbnails = self.config("thumbnails", False)
self.article = self.config("article", False) self.article = self.config("article", False)
self.headers = { self.headers = {"Accept": "application/json, text/plain, */*"}
"Accept" : "application/json, text/plain, */*",
"Authorization": "Bearer null", if "Authorization" not in self.session.headers:
} self.headers["Authorization"] = "Bearer null"
def request(self, url, **kwargs): def request(self, url, **kwargs):
while True: while True:
@ -55,6 +55,12 @@ class SkebExtractor(Extractor):
url = file["file_url"] url = file["file_url"]
yield Message.Url, url, text.nameext_from_url(url, post) yield Message.Url, url, text.nameext_from_url(url, post)
def _items_users(self):
base = self.root + "/@"
for user in self.users():
user["_extractor"] = SkebUserExtractor
yield Message.Queue, base + user["screen_name"], user
def posts(self): def posts(self):
"""Return post number""" """Return post number"""
@ -83,6 +89,20 @@ class SkebExtractor(Extractor):
return return
params["offset"] += 30 params["offset"] += 30
def _pagination_users(self, endpoint, params):
url = "{}/api{}".format(self.root, endpoint)
params["offset"] = 0
params["limit"] = 90
while True:
data = self.request(
url, params=params, headers=self.headers).json()
yield from data
if len(data) < params["limit"]:
return
params["offset"] += params["limit"]
def _get_post_data(self, user_name, post_num): def _get_post_data(self, user_name, post_num):
url = "{}/api/users/{}/works/{}".format( url = "{}/api/users/{}/works/{}".format(
self.root, user_name, post_num) self.root, user_name, post_num)
@ -256,22 +276,23 @@ class SkebFollowingExtractor(SkebExtractor):
pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators" pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators"
example = "https://skeb.jp/@USER/following_creators" example = "https://skeb.jp/@USER/following_creators"
def items(self): items = SkebExtractor._items_users
for user in self.users():
url = "{}/@{}".format(self.root, user["screen_name"])
user["_extractor"] = SkebUserExtractor
yield Message.Queue, url, user
def users(self): def users(self):
url = "{}/api/users/{}/following_creators".format( endpoint = "/users/{}/following_creators".format(self.user_name)
self.root, self.user_name) params = {"sort": "date"}
params = {"sort": "date", "offset": 0, "limit": 90} return self._pagination_users(endpoint, params)
while True:
data = self.request(
url, params=params, headers=self.headers).json()
yield from data
if len(data) < params["limit"]: class SkebFollowingUsersExtractor(SkebExtractor):
return """Extractor for your followed users"""
params["offset"] += params["limit"] subcategory = "following-users"
pattern = r"(?:https?://)?skeb\.jp/following_users()"
example = "https://skeb.jp/following_users"
items = SkebExtractor._items_users
def users(self):
endpoint = "/following_users"
params = {}
return self._pagination_users(endpoint, params)
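Note: both "following" extractors now share one offset/limit pagination helper and the _items_users generator. The pagination itself is a plain offset loop, roughly (requests stands in for the extractor's request wrapper):

import requests

def paginate_users(root, endpoint, params, headers, limit=90):
    params = dict(params, offset=0, limit=limit)
    while True:
        data = requests.get(root + "/api" + endpoint,
                            params=params, headers=headers).json()
        yield from data
        if len(data) < limit:            # a short page means there is nothing after it
            return
        params["offset"] += limit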

View File

@ -163,6 +163,9 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor):
def assets(self): def assets(self):
endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id
asset = self._call(endpoint)["asset"] asset = self._call(endpoint)["asset"]
if asset is None:
raise exception.NotFoundError("asset ({}:{})".format(
self.asset_type, self.asset_id))
return (asset,) return (asset,)

View File

@ -175,7 +175,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor):
"author_id" : text.parse_int(extr('data-user-id="', '"')), "author_id" : text.parse_int(extr('data-user-id="', '"')),
"author_nick": text.unescape(extr('alt="', '"')), "author_nick": text.unescape(extr('alt="', '"')),
"date" : self._parse_datetime(extr( "date" : self._parse_datetime(extr(
'class="section-subtitle">', '<')), '<span class="star_link-types">', '<')),
"content" : (extr( "content" : (extr(
'<div class="post-content', '<div class="post-uploads') '<div class="post-content', '<div class="post-uploads')
.partition(">")[2]), .partition(">")[2]),

View File

@ -151,3 +151,18 @@ class TapasEpisodeExtractor(TapasExtractor):
def episode_ids(self): def episode_ids(self):
return (self.episode_id,) return (self.episode_id,)
class TapasCreatorExtractor(TapasExtractor):
subcategory = "creator"
pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)"
example = "https://tapas.io/CREATOR"
def items(self):
url = "{}/{}/series".format(self.root, self.groups[0])
page = self.request(url).text
page = text.extr(page, '<ul class="content-list-wrap', "</ul>")
data = {"_extractor": TapasSeriesExtractor}
for path in text.extract_iter(page, ' href="', '"'):
yield Message.Queue, self.root + path, data
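Note: the new creator extractor only needs the series list from one HTML fragment; each link is then handed to TapasSeriesExtractor via Message.Queue. Stripped of the gallery-dl plumbing, the scraping step is roughly:

import re

def series_paths(page_html):
    # keep only the series list block, then pull every href out of it
    block = page_html.partition('<ul class="content-list-wrap')[2].partition("</ul>")[0]
    return re.findall(r' href="([^"]+)"', block)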

View File

@ -447,9 +447,9 @@ class TumblrAPI(oauth.OAuth1API):
if api_key == self.API_KEY: if api_key == self.API_KEY:
self.log.info( self.log.info(
"Register your own OAuth application and use its " "Register your own OAuth application and use its "
"credentials to prevent this error: https://githu" "credentials to prevent this error: "
"b.com/mikf/gallery-dl/blob/master/docs/configurat" "https://gdl-org.github.io/docs/configuration.html"
"ion.rst#extractortumblrapi-key--api-secret") "#extractor-tumblr-api-key-api-secret")
if self.extractor.config("ratelimit") == "wait": if self.extractor.config("ratelimit") == "wait":
self.extractor.wait(seconds=reset) self.extractor.wait(seconds=reset)

View File

@ -12,11 +12,12 @@ from .common import Extractor, Message
from .. import text, util, exception from .. import text, util, exception
from ..cache import cache, memcache from ..cache import cache, memcache
import itertools import itertools
import random
import json import json
import re import re
BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?" BASE_PATTERN = (r"(?:https?://)?(?:www\.|mobile\.)?"
r"(?:(?:[fv]x)?twitter|(?:fixup)?x)\.com") r"(?:(?:[fv]x)?twitter|(?:fix(?:up|v))?x)\.com")
class TwitterExtractor(Extractor): class TwitterExtractor(Extractor):
@ -243,8 +244,8 @@ class TwitterExtractor(Extractor):
# collect URLs from entities # collect URLs from entities
for url in tweet["entities"].get("urls") or (): for url in tweet["entities"].get("urls") or ():
url = url["expanded_url"] url = url.get("expanded_url") or url.get("url") or ""
if "//twitpic.com/" not in url or "/photos/" in url: if not url or "//twitpic.com/" not in url or "/photos/" in url:
continue continue
if url.startswith("http:"): if url.startswith("http:"):
url = "https" + url[4:] url = "https" + url[4:]
@ -336,10 +337,20 @@ class TwitterExtractor(Extractor):
urls = entities.get("urls") urls = entities.get("urls")
if urls: if urls:
for url in urls: for url in urls:
content = content.replace(url["url"], url["expanded_url"]) try:
content = content.replace(url["url"], url["expanded_url"])
except KeyError:
pass
txt, _, tco = content.rpartition(" ") txt, _, tco = content.rpartition(" ")
tdata["content"] = txt if tco.startswith("https://t.co/") else content tdata["content"] = txt if tco.startswith("https://t.co/") else content
if "birdwatch_pivot" in tweet:
try:
tdata["birdwatch"] = \
tweet["birdwatch_pivot"]["subtitle"]["text"]
except KeyError:
self.log.debug("Unable to extract 'birdwatch' note from %s",
tweet["birdwatch_pivot"])
if "in_reply_to_screen_name" in legacy: if "in_reply_to_screen_name" in legacy:
tdata["reply_to"] = legacy["in_reply_to_screen_name"] tdata["reply_to"] = legacy["in_reply_to_screen_name"]
if "quoted_by" in legacy: if "quoted_by" in legacy:
@ -380,6 +391,7 @@ class TwitterExtractor(Extractor):
"date" : text.parse_datetime( "date" : text.parse_datetime(
uget("created_at"), "%a %b %d %H:%M:%S %z %Y"), uget("created_at"), "%a %b %d %H:%M:%S %z %Y"),
"verified" : uget("verified", False), "verified" : uget("verified", False),
"protected" : uget("protected", False),
"profile_banner" : uget("profile_banner_url", ""), "profile_banner" : uget("profile_banner_url", ""),
"profile_image" : uget( "profile_image" : uget(
"profile_image_url_https", "").replace("_normal.", "."), "profile_image_url_https", "").replace("_normal.", "."),
@ -395,7 +407,10 @@ class TwitterExtractor(Extractor):
urls = entities["description"].get("urls") urls = entities["description"].get("urls")
if urls: if urls:
for url in urls: for url in urls:
descr = descr.replace(url["url"], url["expanded_url"]) try:
descr = descr.replace(url["url"], url["expanded_url"])
except KeyError:
pass
udata["description"] = descr udata["description"] = descr
if "url" in entities: if "url" in entities:
@ -731,9 +746,10 @@ class TwitterEventExtractor(TwitterExtractor):
class TwitterTweetExtractor(TwitterExtractor): class TwitterTweetExtractor(TwitterExtractor):
"""Extractor for images from individual tweets""" """Extractor for individual tweets"""
subcategory = "tweet" subcategory = "tweet"
pattern = BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)" pattern = (BASE_PATTERN + r"/([^/?#]+|i/web)/status/(\d+)"
r"/?(?:$|\?|#|photo/)")
example = "https://twitter.com/USER/status/12345" example = "https://twitter.com/USER/status/12345"
def __init__(self, match): def __init__(self, match):
@ -810,6 +826,18 @@ class TwitterTweetExtractor(TwitterExtractor):
return itertools.chain(buffer, tweets) return itertools.chain(buffer, tweets)
class TwitterQuotesExtractor(TwitterExtractor):
"""Extractor for quotes of a Tweet"""
subcategory = "quotes"
pattern = BASE_PATTERN + r"/(?:[^/?#]+|i/web)/status/(\d+)/quotes"
example = "https://twitter.com/USER/status/12345/quotes"
def items(self):
url = "{}/search?q=quoted_tweet_id:{}".format(self.root, self.user)
data = {"_extractor": TwitterSearchExtractor}
yield Message.Queue, url, data
class TwitterAvatarExtractor(TwitterExtractor): class TwitterAvatarExtractor(TwitterExtractor):
subcategory = "avatar" subcategory = "avatar"
filename_fmt = "avatar {date}.{extension}" filename_fmt = "avatar {date}.{extension}"
@ -882,6 +910,7 @@ class TwitterAPI():
def __init__(self, extractor): def __init__(self, extractor):
self.extractor = extractor self.extractor = extractor
self.log = extractor.log
self.root = "https://twitter.com/i/api" self.root = "https://twitter.com/i/api"
self._nsfw_warning = True self._nsfw_warning = True
@ -1244,7 +1273,7 @@ class TwitterAPI():
@cache(maxage=3600) @cache(maxage=3600)
def _guest_token(self): def _guest_token(self):
endpoint = "/1.1/guest/activate.json" endpoint = "/1.1/guest/activate.json"
self.extractor.log.info("Requesting guest token") self.log.info("Requesting guest token")
return str(self._call( return str(self._call(
endpoint, None, "POST", False, "https://api.twitter.com", endpoint, None, "POST", False, "https://api.twitter.com",
)["guest_token"]) )["guest_token"])
@ -1272,45 +1301,72 @@ class TwitterAPI():
if csrf_token: if csrf_token:
self.headers["x-csrf-token"] = csrf_token self.headers["x-csrf-token"] = csrf_token
if response.status_code < 400: remaining = int(response.headers.get("x-rate-limit-remaining", 6))
data = response.json() if remaining < 6 and remaining <= random.randrange(1, 6):
if not data.get("errors") or not any( self._handle_ratelimit(response)
(e.get("message") or "").lower().startswith("timeout")
for e in data["errors"]):
return data # success or non-timeout errors
msg = data["errors"][0].get("message") or "Unspecified"
self.extractor.log.debug("Internal Twitter error: '%s'", msg)
if self.headers["x-twitter-auth-type"]:
self.extractor.log.debug("Retrying API request")
continue # retry
# fall through to "Login Required"
response.status_code = 404
if response.status_code == 429:
# rate limit exceeded
if self.extractor.config("ratelimit") == "abort":
raise exception.StopExtraction("Rate limit exceeded")
until = response.headers.get("x-rate-limit-reset")
seconds = None if until else 60
self.extractor.wait(until=until, seconds=seconds)
continue continue
if response.status_code in (403, 404) and \ try:
data = response.json()
except ValueError:
data = {"errors": ({"message": response.text},)}
errors = data.get("errors")
if not errors:
return data
retry = False
for error in errors:
msg = error.get("message") or "Unspecified"
self.log.debug("API error: '%s'", msg)
if "this account is temporarily locked" in msg:
msg = "Account temporarily locked"
if self.extractor.config("locked") != "wait":
raise exception.AuthorizationError(msg)
self.log.warning(msg)
self.extractor.input("Press ENTER to retry.")
retry = True
elif "Could not authenticate you" in msg:
if not self.extractor.config("relogin", True):
continue
username, password = self.extractor._get_auth_info()
if not username:
continue
_login_impl.invalidate(username)
self.extractor.cookies_update(
_login_impl(self.extractor, username, password))
self.__init__(self.extractor)
retry = True
elif msg.lower().startswith("timeout"):
retry = True
if retry:
if self.headers["x-twitter-auth-type"]:
self.log.debug("Retrying API request")
continue
else:
# fall through to "Login Required"
response.status_code = 404
if response.status_code < 400:
return data
elif response.status_code in (403, 404) and \
not self.headers["x-twitter-auth-type"]: not self.headers["x-twitter-auth-type"]:
raise exception.AuthorizationError("Login required") raise exception.AuthorizationError("Login required")
elif response.status_code == 429:
self._handle_ratelimit(response)
continue
# error # error
try: try:
data = response.json() errors = ", ".join(e["message"] for e in errors)
errors = ", ".join(e["message"] for e in data["errors"])
except ValueError:
errors = response.text
except Exception: except Exception:
errors = data.get("errors", "") pass
raise exception.StopExtraction( raise exception.StopExtraction(
"%s %s (%s)", response.status_code, response.reason, errors) "%s %s (%s)", response.status_code, response.reason, errors)
@ -1374,7 +1430,7 @@ class TwitterAPI():
try: try:
tweet = tweets[tweet_id] tweet = tweets[tweet_id]
except KeyError: except KeyError:
self.extractor.log.debug("Skipping %s (deleted)", tweet_id) self.log.debug("Skipping %s (deleted)", tweet_id)
continue continue
if "retweeted_status_id_str" in tweet: if "retweeted_status_id_str" in tweet:
@ -1606,8 +1662,10 @@ class TwitterAPI():
variables["cursor"] = cursor variables["cursor"] = cursor
def _pagination_users(self, endpoint, variables, path=None): def _pagination_users(self, endpoint, variables, path=None):
params = {"variables": None, params = {
"features" : self._json_dumps(self.features_pagination)} "variables": None,
"features" : self._json_dumps(self.features_pagination),
}
while True: while True:
cursor = entry = None cursor = entry = None
@ -1644,6 +1702,13 @@ class TwitterAPI():
return return
variables["cursor"] = cursor variables["cursor"] = cursor
def _handle_ratelimit(self, response):
if self.extractor.config("ratelimit") == "abort":
raise exception.StopExtraction("Rate limit exceeded")
until = response.headers.get("x-rate-limit-reset")
self.extractor.wait(until=until, seconds=None if until else 60)
def _process_tombstone(self, entry, tombstone): def _process_tombstone(self, entry, tombstone):
text = (tombstone.get("richText") or tombstone["text"])["text"] text = (tombstone.get("richText") or tombstone["text"])["text"]
tweet_id = entry["entryId"].rpartition("-")[2] tweet_id = entry["entryId"].rpartition("-")[2]
@ -1651,30 +1716,30 @@ class TwitterAPI():
if text.startswith("Age-restricted"): if text.startswith("Age-restricted"):
if self._nsfw_warning: if self._nsfw_warning:
self._nsfw_warning = False self._nsfw_warning = False
self.extractor.log.warning('"%s"', text) self.log.warning('"%s"', text)
self.extractor.log.debug("Skipping %s (\"%s\")", tweet_id, text) self.log.debug("Skipping %s ('%s')", tweet_id, text)
@cache(maxage=365*86400, keyarg=1) @cache(maxage=365*86400, keyarg=1)
def _login_impl(extr, username, password): def _login_impl(extr, username, password):
import re def process(data, params=None):
import random response = extr.request(
url, params=params, headers=headers, json=data,
method="POST", fatal=None)
if re.fullmatch(r"[\w.%+-]+@[\w.-]+\.\w{2,}", username):
extr.log.warning(
"Login with email is no longer possible. "
"You need to provide your username or phone number instead.")
def process(response):
try: try:
data = response.json() data = response.json()
except ValueError: except ValueError:
data = {"errors": ({"message": "Invalid response"},)} data = {"errors": ({"message": "Invalid response"},)}
else: else:
if response.status_code < 400: if response.status_code < 400:
return data["flow_token"] try:
return (data["flow_token"],
data["subtasks"][0]["subtask_id"])
except LookupError:
pass
errors = [] errors = []
for error in data.get("errors") or (): for error in data.get("errors") or ():
@ -1683,9 +1748,13 @@ def _login_impl(extr, username, password):
extr.log.debug(response.text) extr.log.debug(response.text)
raise exception.AuthenticationError(", ".join(errors)) raise exception.AuthenticationError(", ".join(errors))
extr.cookies.clear() cookies = extr.cookies
cookies.clear()
api = TwitterAPI(extr) api = TwitterAPI(extr)
api._authenticate_guest() api._authenticate_guest()
url = "https://api.twitter.com/1.1/onboarding/task.json"
params = {"flow_name": "login"}
headers = api.headers headers = api.headers
extr.log.info("Logging in as %s", username) extr.log.info("Logging in as %s", username)
@ -1742,31 +1811,18 @@ def _login_impl(extr, username, password):
"web_modal": 1, "web_modal": 1,
}, },
} }
url = "https://api.twitter.com/1.1/onboarding/task.json?flow_name=login"
response = extr.request(url, method="POST", headers=headers, json=data)
data = { flow_token, subtask = process(data, params)
"flow_token": process(response), while not cookies.get("auth_token"):
"subtask_inputs": [ if subtask == "LoginJsInstrumentationSubtask":
{ data = {
"subtask_id": "LoginJsInstrumentationSubtask",
"js_instrumentation": { "js_instrumentation": {
"response": "{}", "response": "{}",
"link": "next_link", "link": "next_link",
}, },
}, }
], elif subtask == "LoginEnterUserIdentifierSSO":
} data = {
url = "https://api.twitter.com/1.1/onboarding/task.json"
response = extr.request(
url, method="POST", headers=headers, json=data, fatal=None)
# username
data = {
"flow_token": process(response),
"subtask_inputs": [
{
"subtask_id": "LoginEnterUserIdentifierSSO",
"settings_list": { "settings_list": {
"setting_responses": [ "setting_responses": [
{ {
@ -1778,48 +1834,61 @@ def _login_impl(extr, username, password):
], ],
"link": "next_link", "link": "next_link",
}, },
}, }
], elif subtask == "LoginEnterPassword":
} data = {
# url = "https://api.twitter.com/1.1/onboarding/task.json"
extr.sleep(random.uniform(2.0, 4.0), "login (username)")
response = extr.request(
url, method="POST", headers=headers, json=data, fatal=None)
# password
data = {
"flow_token": process(response),
"subtask_inputs": [
{
"subtask_id": "LoginEnterPassword",
"enter_password": { "enter_password": {
"password": password, "password": password,
"link": "next_link", "link": "next_link",
}, },
}, }
], elif subtask == "LoginEnterAlternateIdentifierSubtask":
} alt = extr.input(
# url = "https://api.twitter.com/1.1/onboarding/task.json" "Alternate Identifier (username, email, phone number): ")
extr.sleep(random.uniform(2.0, 4.0), "login (password)") data = {
response = extr.request( "enter_text": {
url, method="POST", headers=headers, json=data, fatal=None) "text": alt,
"link": "next_link",
# account duplication check ? },
data = { }
"flow_token": process(response), elif subtask == "LoginTwoFactorAuthChallenge":
"subtask_inputs": [ data = {
{ "enter_text": {
"subtask_id": "AccountDuplicationCheck", "text": extr.input("2FA Token: "),
"link": "next_link",
},
}
elif subtask == "LoginAcid":
data = {
"enter_text": {
"text": extr.input("Email Verification Code: "),
"link": "next_link",
},
}
elif subtask == "AccountDuplicationCheck":
data = {
"check_logged_in_account": { "check_logged_in_account": {
"link": "AccountDuplicationCheck_false", "link": "AccountDuplicationCheck_false",
}, },
}, }
], elif subtask == "ArkoseLogin":
} raise exception.AuthenticationError("Login requires CAPTCHA")
# url = "https://api.twitter.com/1.1/onboarding/task.json" elif subtask == "DenyLoginSubtask":
response = extr.request( raise exception.AuthenticationError("Login rejected as suspicious")
url, method="POST", headers=headers, json=data, fatal=None) elif subtask == "ArkoseLogin":
process(response) raise exception.AuthenticationError("No auth token cookie")
else:
raise exception.StopExtraction("Unrecognized subtask %s", subtask)
inputs = {"subtask_id": subtask}
inputs.update(data)
data = {
"flow_token": flow_token,
"subtask_inputs": [inputs],
}
extr.sleep(random.uniform(1.0, 3.0), "login ({})".format(subtask))
flow_token, subtask = process(data)
return { return {
cookie.name: cookie.value cookie.name: cookie.value
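Note: the rewritten _login_impl is a small state machine: send the answer for the current subtask, read back the next (flow_token, subtask) pair, and repeat until an auth_token cookie appears. A schematic version of that loop (the callbacks are placeholders; subtask names come from the diff above):

def run_login_flow(post_task, answer_for, has_auth_cookie):
    # post_task(payload) -> (flow_token, subtask); answer_for(subtask) -> dict or None
    flow_token, subtask = post_task(None)            # the first call just starts the flow
    while not has_auth_cookie():
        answer = answer_for(subtask)                 # password, 2FA token, email code, ...
        if answer is None:
            raise RuntimeError("Unrecognized subtask " + subtask)
        flow_token, subtask = post_task({
            "flow_token": flow_token,
            "subtask_inputs": [dict({"subtask_id": subtask}, **answer)],
        })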

View File

@ -26,17 +26,39 @@ class VipergirlsExtractor(Extractor):
cookies_domain = ".vipergirls.to" cookies_domain = ".vipergirls.to"
cookies_names = ("vg_userid", "vg_password") cookies_names = ("vg_userid", "vg_password")
def _init(self):
domain = self.config("domain")
if domain:
self.root = text.ensure_http_scheme(domain)
def items(self): def items(self):
self.login() self.login()
posts = self.posts()
for post in self.posts(): like = self.config("like")
if like:
user_hash = posts[0].get("hash")
if len(user_hash) < 16:
self.log.warning("Login required to like posts")
like = False
posts = posts.iter("post")
if self.page:
util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
for post in posts:
data = post.attrib data = post.attrib
data["thread_id"] = self.thread_id data["thread_id"] = self.thread_id
yield Message.Directory, data yield Message.Directory, data
image = None
for image in post: for image in post:
yield Message.Queue, image.attrib["main_url"], data yield Message.Queue, image.attrib["main_url"], data
if image is not None and like:
self.like(post, user_hash)
def login(self): def login(self):
if self.cookies_check(self.cookies_names): if self.cookies_check(self.cookies_names):
return return
@ -64,6 +86,17 @@ class VipergirlsExtractor(Extractor):
return {cookie.name: cookie.value return {cookie.name: cookie.value
for cookie in response.cookies} for cookie in response.cookies}
def like(self, post, user_hash):
url = self.root + "/post_thanks.php"
params = {
"do" : "post_thanks_add",
"p" : post.get("id"),
"securitytoken": user_hash,
}
with self.request(url, params=params, allow_redirects=False):
pass
class VipergirlsThreadExtractor(VipergirlsExtractor): class VipergirlsThreadExtractor(VipergirlsExtractor):
"""Extractor for vipergirls threads""" """Extractor for vipergirls threads"""
@ -77,12 +110,7 @@ class VipergirlsThreadExtractor(VipergirlsExtractor):
def posts(self): def posts(self):
url = "{}/vr.php?t={}".format(self.root, self.thread_id) url = "{}/vr.php?t={}".format(self.root, self.thread_id)
root = ElementTree.fromstring(self.request(url).text) return ElementTree.fromstring(self.request(url).text)
posts = root.iter("post")
if self.page:
util.advance(posts, (text.parse_int(self.page[5:]) - 1) * 15)
return posts
class VipergirlsPostExtractor(VipergirlsExtractor): class VipergirlsPostExtractor(VipergirlsExtractor):
@ -95,8 +123,8 @@ class VipergirlsPostExtractor(VipergirlsExtractor):
def __init__(self, match): def __init__(self, match):
VipergirlsExtractor.__init__(self, match) VipergirlsExtractor.__init__(self, match)
self.thread_id, self.post_id = match.groups() self.thread_id, self.post_id = match.groups()
self.page = 0
def posts(self): def posts(self):
url = "{}/vr.php?p={}".format(self.root, self.post_id) url = "{}/vr.php?p={}".format(self.root, self.post_id)
root = ElementTree.fromstring(self.request(url).text) return ElementTree.fromstring(self.request(url).text)
return root.iter("post")

View File

@ -46,6 +46,8 @@ class VscoExtractor(Extractor):
url = "https://image-{}.vsco.co/{}".format(cdn, path) url = "https://image-{}.vsco.co/{}".format(cdn, path)
elif cdn.isdecimal(): elif cdn.isdecimal():
url = "https://image.vsco.co/" + base url = "https://image.vsco.co/" + base
elif img["responsive_url"].startswith("http"):
url = img["responsive_url"]
else: else:
url = "https://" + img["responsive_url"] url = "https://" + img["responsive_url"]
@ -238,6 +240,34 @@ class VscoSpacesExtractor(VscoExtractor):
yield Message.Queue, url, space yield Message.Queue, url, space
class VscoAvatarExtractor(VscoExtractor):
"""Extractor for vsco.co user avatars"""
subcategory = "avatar"
pattern = USER_PATTERN + r"/avatar"
example = "https://vsco.co/USER/avatar"
def images(self):
url = "{}/{}/gallery".format(self.root, self.user)
page = self.request(url).text
piid = text.extr(page, '"profileImageId":"', '"')
url = "https://im.vsco.co/" + piid
# needs GET request, since HEAD does not redirect to full URL
response = self.request(url, allow_redirects=False)
return ({
"_id" : piid,
"is_video" : False,
"grid_name" : "",
"upload_date" : 0,
"responsive_url": response.headers["Location"],
"video_url" : "",
"image_meta" : None,
"width" : 0,
"height" : 0,
},)
class VscoImageExtractor(VscoExtractor): class VscoImageExtractor(VscoExtractor):
"""Extractor for individual images on vsco.co""" """Extractor for individual images on vsco.co"""
subcategory = "image" subcategory = "image"

View File

@ -50,7 +50,7 @@ class WarosuThreadExtractor(Extractor):
title = text.unescape(text.extr(page, "class=filetitle>", "<")) title = text.unescape(text.extr(page, "class=filetitle>", "<"))
return { return {
"board" : self.board, "board" : self.board,
"board_name": boardname.rpartition(" - ")[2], "board_name": boardname.split(" - ")[1],
"thread" : self.thread, "thread" : self.thread,
"title" : title, "title" : title,
} }
@ -64,8 +64,7 @@ class WarosuThreadExtractor(Extractor):
def parse(self, post): def parse(self, post):
"""Build post object by extracting data from an HTML post""" """Build post object by extracting data from an HTML post"""
data = self._extract_post(post) data = self._extract_post(post)
if "<span> File:" in post: if "<span> File:" in post and self._extract_image(post, data):
self._extract_image(post, data)
part = data["image"].rpartition("/")[2] part = data["image"].rpartition("/")[2]
data["tim"], _, data["extension"] = part.partition(".") data["tim"], _, data["extension"] = part.partition(".")
data["ext"] = "." + data["extension"] data["ext"] = "." + data["extension"]
@ -91,6 +90,11 @@ class WarosuThreadExtractor(Extractor):
"", "<").rstrip().rpartition(".")[0]) "", "<").rstrip().rpartition(".")[0])
extr("<br>", "") extr("<br>", "")
data["image"] = url = extr("<a href=", ">") url = extr("<a href=", ">")
if url[0] == "/": if url:
data["image"] = self.root + url if url[0] == "/":
data["image"] = self.root + url
else:
data["image"] = url
return True
return False

View File

@ -30,9 +30,9 @@ class WeiboExtractor(Extractor):
self._prefix, self.user = match.groups() self._prefix, self.user = match.groups()
def _init(self): def _init(self):
self.retweets = self.config("retweets", True)
self.videos = self.config("videos", True)
self.livephoto = self.config("livephoto", True) self.livephoto = self.config("livephoto", True)
self.retweets = self.config("retweets", False)
self.videos = self.config("videos", True)
self.gifs = self.config("gifs", True) self.gifs = self.config("gifs", True)
self.gifs_video = (self.gifs == "video") self.gifs_video = (self.gifs == "video")
@ -59,15 +59,25 @@ class WeiboExtractor(Extractor):
for status in self.statuses(): for status in self.statuses():
files = [] if "ori_mid" in status and not self.retweets:
if self.retweets and "retweeted_status" in status: self.log.debug("Skipping %s (快转 retweet)", status["id"])
continue
if "retweeted_status" in status:
if not self.retweets:
self.log.debug("Skipping %s (retweet)", status["id"])
continue
# videos of the original post are in status
# images of the original post are in status["retweeted_status"]
files = []
self._extract_status(status, files)
self._extract_status(status["retweeted_status"], files)
if original_retweets: if original_retweets:
status = status["retweeted_status"] status = status["retweeted_status"]
self._extract_status(status, files)
else:
self._extract_status(status, files)
self._extract_status(status["retweeted_status"], files)
else: else:
files = []
self._extract_status(status, files) self._extract_status(status, files)
status["date"] = text.parse_datetime( status["date"] = text.parse_datetime(
@ -118,7 +128,7 @@ class WeiboExtractor(Extractor):
append(pic["largest"].copy()) append(pic["largest"].copy())
file = {"url": pic["video"]} file = {"url": pic["video"]}
file["filehame"], _, file["extension"] = \ file["filename"], _, file["extension"] = \
pic["video"].rpartition("%2F")[2].rpartition(".") pic["video"].rpartition("%2F")[2].rpartition(".")
append(file) append(file)
@ -176,23 +186,34 @@ class WeiboExtractor(Extractor):
data = data["data"] data = data["data"]
statuses = data["list"] statuses = data["list"]
if not statuses:
return
yield from statuses yield from statuses
if "next_cursor" in data: # videos, newvideo # videos, newvideo
if data["next_cursor"] == -1: cursor = data.get("next_cursor")
if cursor:
if cursor == -1:
return return
params["cursor"] = data["next_cursor"] params["cursor"] = cursor
elif "page" in params: # home, article continue
params["page"] += 1
elif data["since_id"]: # album # album
since_id = data.get("since_id")
if since_id:
params["sinceid"] = data["since_id"] params["sinceid"] = data["since_id"]
else: # feed, last album page continue
try:
params["since_id"] = statuses[-1]["id"] - 1 # home, article
except KeyError: if "page" in params:
if not statuses:
return return
params["page"] += 1
continue
# feed, last album page
try:
params["since_id"] = statuses[-1]["id"] - 1
except LookupError:
return
def _sina_visitor_system(self, response): def _sina_visitor_system(self, response):
self.log.info("Sina Visitor System") self.log.info("Sina Visitor System")

View File

@ -27,9 +27,9 @@ class WikimediaExtractor(BaseExtractor):
if self.category == "wikimedia": if self.category == "wikimedia":
self.category = self.root.split(".")[-2] self.category = self.root.split(".")[-2]
elif self.category == "fandom": elif self.category in ("fandom", "wikigg"):
self.category = \ self.category = "{}-{}".format(
"fandom-" + self.root.partition(".")[0].rpartition("/")[2] self.category, self.root.partition(".")[0].rpartition("/")[2])
if path.startswith("wiki/"): if path.startswith("wiki/"):
path = path[5:] path = path[5:]
@ -69,14 +69,18 @@ class WikimediaExtractor(BaseExtractor):
def items(self): def items(self):
for info in self._pagination(self.params): for info in self._pagination(self.params):
image = info["imageinfo"][0] try:
image = info["imageinfo"][0]
except LookupError:
self.log.debug("Missing 'imageinfo' for %s", info)
continue
image["metadata"] = { image["metadata"] = {
m["name"]: m["value"] m["name"]: m["value"]
for m in image["metadata"]} for m in image["metadata"] or ()}
image["commonmetadata"] = { image["commonmetadata"] = {
m["name"]: m["value"] m["name"]: m["value"]
for m in image["commonmetadata"]} for m in image["commonmetadata"] or ()}
filename = image["canonicaltitle"] filename = image["canonicaltitle"]
image["filename"], _, image["extension"] = \ image["filename"], _, image["extension"] = \
@ -148,6 +152,10 @@ BASE_PATTERN = WikimediaExtractor.update({
"root": None, "root": None,
"pattern": r"[\w-]+\.fandom\.com", "pattern": r"[\w-]+\.fandom\.com",
}, },
"wikigg": {
"root": None,
"pattern": r"\w+\.wiki\.gg",
},
"mariowiki": { "mariowiki": {
"root": "https://www.mariowiki.com", "root": "https://www.mariowiki.com",
"pattern": r"(?:www\.)?mariowiki\.com", "pattern": r"(?:www\.)?mariowiki\.com",

View File

@ -243,13 +243,12 @@ class TemplateFStringFormatter(FStringFormatter):
def parse_field_name(field_name): def parse_field_name(field_name):
if field_name[0] == "'":
return "_lit", (operator.itemgetter(field_name[1:-1]),)
first, rest = _string.formatter_field_name_split(field_name) first, rest = _string.formatter_field_name_split(field_name)
funcs = [] funcs = []
if first[0] == "'":
funcs.append(operator.itemgetter(first[1:-1]))
first = "_lit"
for is_attr, key in rest: for is_attr, key in rest:
if is_attr: if is_attr:
func = operator.attrgetter func = operator.attrgetter
@ -375,18 +374,18 @@ def _parse_offset(format_spec, default):
fmt = _build_format_func(format_spec, default) fmt = _build_format_func(format_spec, default)
if not offset or offset == "local": if not offset or offset == "local":
is_dst = time.daylight and time.localtime().tm_isdst > 0 def off(dt):
offset = -(time.altzone if is_dst else time.timezone) local = time.localtime(util.datetime_to_timestamp(dt))
return fmt(dt + datetime.timedelta(0, local.tm_gmtoff))
else: else:
hours, _, minutes = offset.partition(":") hours, _, minutes = offset.partition(":")
offset = 3600 * int(hours) offset = 3600 * int(hours)
if minutes: if minutes:
offset += 60 * (int(minutes) if offset > 0 else -int(minutes)) offset += 60 * (int(minutes) if offset > 0 else -int(minutes))
offset = datetime.timedelta(0, offset)
offset = datetime.timedelta(seconds=offset) def off(obj):
return fmt(obj + offset)
def off(obj):
return fmt(obj + offset)
return off return off
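Note: the old "local" offset was computed once from time.timezone/altzone, so timestamps on the other side of a DST switch came out wrong; the new code asks time.localtime() for the offset that applied at each individual timestamp. A self-contained equivalent (util.datetime_to_timestamp replaced by a plain UTC conversion):

import datetime
import time

def to_local(dt_utc):
    # look up the UTC offset in effect at that exact moment, DST included
    ts = dt_utc.replace(tzinfo=datetime.timezone.utc).timestamp()
    return dt_utc + datetime.timedelta(seconds=time.localtime(ts).tm_gmtoff)

print(to_local(datetime.datetime(2024, 1, 15, 12, 0)))
print(to_local(datetime.datetime(2024, 7, 15, 12, 0)))   # may differ by an hour under DST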

View File

@ -11,10 +11,23 @@ import errno
import logging import logging
import functools import functools
import collections import collections
from . import extractor, downloader, postprocessor
from . import config, text, util, path, formatter, output, exception, version from . import (
extractor,
downloader,
postprocessor,
archive,
config,
exception,
formatter,
output,
path,
text,
util,
version,
)
from .extractor.message import Message from .extractor.message import Message
from .output import stdout_write stdout_write = output.stdout_write
class Job(): class Job():
@ -423,6 +436,8 @@ class DownloadJob(Job):
def handle_finalize(self): def handle_finalize(self):
if self.archive: if self.archive:
if not self.status:
self.archive.finalize()
self.archive.close() self.archive.close()
pathfmt = self.pathfmt pathfmt = self.pathfmt
@ -453,9 +468,12 @@ class DownloadJob(Job):
for callback in self.hooks["skip"]: for callback in self.hooks["skip"]:
callback(pathfmt) callback(pathfmt)
if self._skipexc: if self._skipexc:
self._skipcnt += 1 if not self._skipftr or self._skipftr(pathfmt.kwdict):
if self._skipcnt >= self._skipmax: self._skipcnt += 1
raise self._skipexc() if self._skipcnt >= self._skipmax:
raise self._skipexc()
else:
self._skipcnt = 0
def download(self, url): def download(self, url):
"""Download 'url'""" """Download 'url'"""
@ -507,23 +525,28 @@ class DownloadJob(Job):
# monkey-patch method to do nothing and always return True # monkey-patch method to do nothing and always return True
self.download = pathfmt.fix_extension self.download = pathfmt.fix_extension
archive = cfg("archive") archive_path = cfg("archive")
if archive: if archive_path:
archive = util.expand_path(archive) archive_path = util.expand_path(archive_path)
archive_format = (cfg("archive-prefix", extr.category) + archive_format = (cfg("archive-prefix", extr.category) +
cfg("archive-format", extr.archive_fmt)) cfg("archive-format", extr.archive_fmt))
archive_pragma = (cfg("archive-pragma")) archive_pragma = (cfg("archive-pragma"))
try: try:
if "{" in archive: if "{" in archive_path:
archive = formatter.parse(archive).format_map(kwdict) archive_path = formatter.parse(
self.archive = util.DownloadArchive( archive_path).format_map(kwdict)
archive, archive_format, archive_pragma) if cfg("archive-mode") == "memory":
archive_cls = archive.DownloadArchiveMemory
else:
archive_cls = archive.DownloadArchive
self.archive = archive_cls(
archive_path, archive_format, archive_pragma)
except Exception as exc: except Exception as exc:
extr.log.warning( extr.log.warning(
"Failed to open download archive at '%s' (%s: %s)", "Failed to open download archive at '%s' (%s: %s)",
archive, exc.__class__.__name__, exc) archive_path, exc.__class__.__name__, exc)
else: else:
extr.log.debug("Using download archive '%s'", archive) extr.log.debug("Using download archive '%s'", archive_path)
skip = cfg("skip", True) skip = cfg("skip", True)
if skip: if skip:
@ -539,6 +562,12 @@ class DownloadJob(Job):
elif skip == "exit": elif skip == "exit":
self._skipexc = SystemExit self._skipexc = SystemExit
self._skipmax = text.parse_int(smax) self._skipmax = text.parse_int(smax)
skip_filter = cfg("skip-filter")
if skip_filter:
self._skipftr = util.compile_expression(skip_filter)
else:
self._skipftr = None
else: else:
# monkey-patch methods to always return False # monkey-patch methods to always return False
pathfmt.exists = lambda x=None: False pathfmt.exists = lambda x=None: False
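Note: the "skip-filter" option narrows which skipped files count toward the abort/exit threshold: only files whose metadata matches the expression increment the counter, and any non-matching skip resets it. A rough standalone model of that behaviour (compile/eval stand in for util.compile_expression):

class SkipCounter:
    def __init__(self, limit, filter_expr=None):
        self.limit = limit
        self.count = 0
        self.filter = compile(filter_expr, "<skip-filter>", "eval") if filter_expr else None

    def skipped(self, kwdict):
        if self.filter is None or eval(self.filter, {}, kwdict):
            self.count += 1
            if self.count >= self.limit:
                raise SystemExit("aborting after %d consecutive skips" % self.count)
        else:
            self.count = 0           # a skip that does not match the filter resets the streak

counter = SkipCounter(3, "extension == 'mp4'")
counter.skipped({"extension": "jpg"})   # does not count
counter.skipped({"extension": "mp4"})   # counts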

View File

@ -249,6 +249,12 @@ def build_parser():
action="store_const", const=logging.ERROR, action="store_const", const=logging.ERROR,
help="Activate quiet mode", help="Activate quiet mode",
) )
output.add_argument(
"-w", "--warning",
dest="loglevel",
action="store_const", const=logging.WARNING,
help="Print only warnings and errors",
)
output.add_argument( output.add_argument(
"-v", "--verbose", "-v", "--verbose",
dest="loglevel", dest="loglevel",
@ -319,6 +325,11 @@ def build_parser():
help=("Write downloaded intermediary pages to files " help=("Write downloaded intermediary pages to files "
"in the current directory to debug problems"), "in the current directory to debug problems"),
) )
output.add_argument(
"--no-colors",
dest="colors", action="store_false",
help=("Do not emit ANSI color codes in output"),
)
downloader = parser.add_argument_group("Downloader Options") downloader = parser.add_argument_group("Downloader Options")
downloader.add_argument( downloader.add_argument(

View File

@ -15,12 +15,40 @@ import unicodedata
from . import config, util, formatter from . import config, util, formatter
# --------------------------------------------------------------------
# Globals
COLORS = not os.environ.get("NO_COLOR")
COLORS_DEFAULT = {
"success": "1;32",
"skip" : "2",
"debug" : "0;37",
"info" : "1;37",
"warning": "1;33",
"error" : "1;31",
} if COLORS else {}
if util.WINDOWS:
ANSI = COLORS and os.environ.get("TERM") == "ANSI"
OFFSET = 1
CHAR_SKIP = "# "
CHAR_SUCCESS = "* "
CHAR_ELLIPSIES = "..."
else:
ANSI = COLORS
OFFSET = 0
CHAR_SKIP = "# "
CHAR_SUCCESS = ""
CHAR_ELLIPSIES = ""
# -------------------------------------------------------------------- # --------------------------------------------------------------------
# Logging # Logging
LOG_FORMAT = "[{name}][{levelname}] {message}" LOG_FORMAT = "[{name}][{levelname}] {message}"
LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S" LOG_FORMAT_DATE = "%Y-%m-%d %H:%M:%S"
LOG_LEVEL = logging.INFO LOG_LEVEL = logging.INFO
LOG_LEVELS = ("debug", "info", "warning", "error")
class Logger(logging.Logger): class Logger(logging.Logger):
@ -129,7 +157,7 @@ class Formatter(logging.Formatter):
def __init__(self, fmt, datefmt): def __init__(self, fmt, datefmt):
if isinstance(fmt, dict): if isinstance(fmt, dict):
for key in ("debug", "info", "warning", "error"): for key in LOG_LEVELS:
value = fmt[key] if key in fmt else LOG_FORMAT value = fmt[key] if key in fmt else LOG_FORMAT
fmt[key] = (formatter.parse(value).format_map, fmt[key] = (formatter.parse(value).format_map,
"{asctime" in value) "{asctime" in value)
@ -187,16 +215,36 @@ def configure_logging(loglevel):
# stream logging handler # stream logging handler
handler = root.handlers[0] handler = root.handlers[0]
opts = config.interpolate(("output",), "log") opts = config.interpolate(("output",), "log")
colors = config.interpolate(("output",), "colors")
if colors is None:
colors = COLORS_DEFAULT
if colors and not opts:
opts = LOG_FORMAT
if opts: if opts:
if isinstance(opts, str): if isinstance(opts, str):
opts = {"format": opts} logfmt = opts
if handler.level == LOG_LEVEL and "level" in opts: opts = {}
elif "format" in opts:
logfmt = opts["format"]
else:
logfmt = LOG_FORMAT
if not isinstance(logfmt, dict) and colors:
ansifmt = "\033[{}m{}\033[0m".format
lf = {}
for level in LOG_LEVELS:
c = colors.get(level)
lf[level] = ansifmt(c, logfmt) if c else logfmt
logfmt = lf
handler.setFormatter(Formatter(
logfmt, opts.get("format-date", LOG_FORMAT_DATE)))
if "level" in opts and handler.level == LOG_LEVEL:
handler.setLevel(opts["level"]) handler.setLevel(opts["level"])
if "format" in opts or "format-date" in opts:
handler.setFormatter(Formatter(
opts.get("format", LOG_FORMAT),
opts.get("format-date", LOG_FORMAT_DATE),
))
if minlevel > handler.level: if minlevel > handler.level:
minlevel = handler.level minlevel = handler.level
@ -307,9 +355,12 @@ def select():
mode = config.get(("output",), "mode") mode = config.get(("output",), "mode")
if mode is None or mode == "auto": if mode is None or mode == "auto":
if hasattr(sys.stdout, "isatty") and sys.stdout.isatty(): try:
output = ColorOutput() if ANSI else TerminalOutput() if sys.stdout.isatty():
else: output = ColorOutput() if ANSI else TerminalOutput()
else:
output = PipeOutput()
except Exception:
output = PipeOutput() output = PipeOutput()
elif isinstance(mode, dict): elif isinstance(mode, dict):
output = CustomOutput(mode) output = CustomOutput(mode)
@ -388,7 +439,10 @@ class ColorOutput(TerminalOutput):
def __init__(self): def __init__(self):
TerminalOutput.__init__(self) TerminalOutput.__init__(self)
colors = config.get(("output",), "colors") or {} colors = config.interpolate(("output",), "colors")
if colors is None:
colors = COLORS_DEFAULT
self.color_skip = "\033[{}m".format( self.color_skip = "\033[{}m".format(
colors.get("skip", "2")) colors.get("skip", "2"))
self.color_success = "\r\033[{}m".format( self.color_success = "\r\033[{}m".format(
@ -514,17 +568,3 @@ def shorten_string_eaw(txt, limit, sep="…", cache=EAWCache()):
right -= 1 right -= 1
return txt[:left] + sep + txt[right+1:] return txt[:left] + sep + txt[right+1:]
if util.WINDOWS:
ANSI = os.environ.get("TERM") == "ANSI"
OFFSET = 1
CHAR_SKIP = "# "
CHAR_SUCCESS = "* "
CHAR_ELLIPSIES = "..."
else:
ANSI = True
OFFSET = 0
CHAR_SKIP = "# "
CHAR_SUCCESS = ""
CHAR_ELLIPSIES = ""
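Note: colored logging works by giving the Formatter a per-level dict of format strings, each wrapped in the ANSI escape code for that level, and is disabled when NO_COLOR is set or no TTY/ANSI support is detected. The wrapping step in isolation:

LOG_FORMAT = "[{name}][{levelname}] {message}"
COLORS_DEFAULT = {"debug": "0;37", "info": "1;37", "warning": "1;33", "error": "1;31"}

def per_level_formats(fmt=LOG_FORMAT, colors=COLORS_DEFAULT):
    # wrap the base format string in an ANSI color escape for each log level
    wrap = "\033[{}m{}\033[0m".format
    return {level: wrap(code, fmt) if code else fmt for level, code in colors.items()}

for level, colored in per_level_formats().items():
    print(level, repr(colored))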

View File

@ -8,7 +8,7 @@
"""Common classes and constants used by postprocessor modules.""" """Common classes and constants used by postprocessor modules."""
from .. import util, formatter from .. import util, formatter, archive
class PostProcessor(): class PostProcessor():
@ -22,30 +22,31 @@ class PostProcessor():
return self.__class__.__name__ return self.__class__.__name__
def _init_archive(self, job, options, prefix=None): def _init_archive(self, job, options, prefix=None):
archive = options.get("archive") archive_path = options.get("archive")
if archive: if archive_path:
extr = job.extractor extr = job.extractor
archive = util.expand_path(archive) archive_path = util.expand_path(archive_path)
if not prefix: if not prefix:
prefix = "_" + self.name.upper() + "_" prefix = "_" + self.name.upper() + "_"
archive_format = ( archive_format = (
options.get("archive-prefix", extr.category) + options.get("archive-prefix", extr.category) +
options.get("archive-format", prefix + extr.archive_fmt)) options.get("archive-format", prefix + extr.archive_fmt))
try: try:
if "{" in archive: if "{" in archive_path:
archive = formatter.parse(archive).format_map( archive_path = formatter.parse(archive_path).format_map(
job.pathfmt.kwdict) job.pathfmt.kwdict)
self.archive = util.DownloadArchive( self.archive = archive.DownloadArchive(
archive, archive_format, archive_path, archive_format,
options.get("archive-pragma"), options.get("archive-pragma"),
"_archive_" + self.name) "_archive_" + self.name)
except Exception as exc: except Exception as exc:
self.log.warning( self.log.warning(
"Failed to open %s archive at '%s' (%s: %s)", "Failed to open %s archive at '%s' (%s: %s)",
self.name, archive, exc.__class__.__name__, exc) self.name, archive_path, exc.__class__.__name__, exc)
else: else:
self.log.debug("Using %s archive '%s'", self.name, archive) self.log.debug(
"Using %s archive '%s'", self.name, archive_path)
return True return True
else:
self.archive = None self.archive = None
return False return False

View File

@ -10,7 +10,6 @@
from .common import PostProcessor from .common import PostProcessor
from .. import util, formatter from .. import util, formatter
import subprocess
import os import os
import re import re
@ -80,14 +79,14 @@ class ExecPP(PostProcessor):
def _exec(self, args, shell): def _exec(self, args, shell):
self.log.debug("Running '%s'", args) self.log.debug("Running '%s'", args)
retcode = subprocess.Popen(args, shell=shell).wait() retcode = util.Popen(args, shell=shell).wait()
if retcode: if retcode:
self.log.warning("'%s' returned with non-zero exit status (%d)", self.log.warning("'%s' returned with non-zero exit status (%d)",
args, retcode) args, retcode)
def _exec_async(self, args, shell): def _exec_async(self, args, shell):
self.log.debug("Running '%s'", args) self.log.debug("Running '%s'", args)
subprocess.Popen(args, shell=shell) util.Popen(args, shell=shell)
def _replace(self, match): def _replace(self, match):
name = match.group(1) name = match.group(1)

View File

@ -33,6 +33,9 @@ class MtimePP(PostProcessor):
def run(self, pathfmt): def run(self, pathfmt):
mtime = self._get(pathfmt.kwdict) mtime = self._get(pathfmt.kwdict)
if mtime is None:
return
pathfmt.kwdict["_mtime"] = ( pathfmt.kwdict["_mtime"] = (
util.datetime_to_timestamp(mtime) util.datetime_to_timestamp(mtime)
if isinstance(mtime, datetime) else if isinstance(mtime, datetime) else

View File

@ -155,7 +155,9 @@ class UgoiraPP(PostProcessor):
self.log.error("Unable to invoke FFmpeg (%s: %s)", self.log.error("Unable to invoke FFmpeg (%s: %s)",
exc.__class__.__name__, exc) exc.__class__.__name__, exc)
pathfmt.realpath = pathfmt.temppath pathfmt.realpath = pathfmt.temppath
except Exception: except Exception as exc:
print()
self.log.error("%s: %s", exc.__class__.__name__, exc)
pathfmt.realpath = pathfmt.temppath pathfmt.realpath = pathfmt.temppath
else: else:
if self.mtime: if self.mtime:
@ -171,7 +173,7 @@ class UgoiraPP(PostProcessor):
def _exec(self, args): def _exec(self, args):
self.log.debug(args) self.log.debug(args)
out = None if self.output else subprocess.DEVNULL out = None if self.output else subprocess.DEVNULL
retcode = subprocess.Popen(args, stdout=out, stderr=out).wait() retcode = util.Popen(args, stdout=out, stderr=out).wait()
if retcode: if retcode:
print() print()
self.log.error("Non-zero exit status when running %s (%s)", self.log.error("Non-zero exit status when running %s (%s)",

View File

@ -73,7 +73,7 @@ def filename_from_url(url):
"""Extract the last part of an URL to use as a filename""" """Extract the last part of an URL to use as a filename"""
try: try:
return url.partition("?")[0].rpartition("/")[2] return url.partition("?")[0].rpartition("/")[2]
except (TypeError, AttributeError): except Exception:
return "" return ""
@ -122,7 +122,7 @@ def extract(txt, begin, end, pos=0):
first = txt.index(begin, pos) + len(begin) first = txt.index(begin, pos) + len(begin)
last = txt.index(end, first) last = txt.index(end, first)
return txt[first:last], last+len(end) return txt[first:last], last+len(end)
except (ValueError, TypeError, AttributeError): except Exception:
return None, pos return None, pos
@ -131,7 +131,7 @@ def extr(txt, begin, end, default=""):
try: try:
first = txt.index(begin) + len(begin) first = txt.index(begin) + len(begin)
return txt[first:txt.index(end, first)] return txt[first:txt.index(end, first)]
except (ValueError, TypeError, AttributeError): except Exception:
return default return default
@ -141,7 +141,7 @@ def rextract(txt, begin, end, pos=-1):
first = txt.rindex(begin, 0, pos) first = txt.rindex(begin, 0, pos)
last = txt.index(end, first + lbeg) last = txt.index(end, first + lbeg)
return txt[first + lbeg:last], first return txt[first + lbeg:last], first
except (ValueError, TypeError, AttributeError): except Exception:
return None, pos return None, pos
@ -167,7 +167,7 @@ def extract_iter(txt, begin, end, pos=0):
last = index(end, first) last = index(end, first)
pos = last + lend pos = last + lend
yield txt[first:last] yield txt[first:last]
except (ValueError, TypeError, AttributeError): except Exception:
return return
@ -180,7 +180,7 @@ def extract_from(txt, pos=0, default=""):
last = index(end, first) last = index(end, first)
pos = last + len(end) pos = last + len(end)
return txt[first:last] return txt[first:last]
except (ValueError, TypeError, AttributeError): except Exception:
return default return default
return extr return extr
@ -200,7 +200,7 @@ def parse_bytes(value, default=0, suffixes="bkmgtp"):
"""Convert a bytes-amount ("500k", "2.5M", ...) to int""" """Convert a bytes-amount ("500k", "2.5M", ...) to int"""
try: try:
last = value[-1].lower() last = value[-1].lower()
except (TypeError, LookupError): except Exception:
return default return default
if last in suffixes: if last in suffixes:
@ -221,7 +221,7 @@ def parse_int(value, default=0):
return default return default
try: try:
return int(value) return int(value)
except (ValueError, TypeError): except Exception:
return default return default
@ -231,7 +231,7 @@ def parse_float(value, default=0.0):
return default return default
try: try:
return float(value) return float(value)
except (ValueError, TypeError): except Exception:
return default return default
@ -242,7 +242,7 @@ def parse_query(qs):
for key, value in urllib.parse.parse_qsl(qs): for key, value in urllib.parse.parse_qsl(qs):
if key not in result: if key not in result:
result[key] = value result[key] = value
except AttributeError: except Exception:
pass pass
return result return result
@ -251,7 +251,7 @@ def parse_timestamp(ts, default=None):
"""Create a datetime object from a unix timestamp""" """Create a datetime object from a unix timestamp"""
try: try:
return datetime.datetime.utcfromtimestamp(int(ts)) return datetime.datetime.utcfromtimestamp(int(ts))
except (TypeError, ValueError, OverflowError): except Exception:
return default return default
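All of the hunks above replace narrow exception tuples with a blanket except Exception; the helpers keep their contract of returning a caller-supplied default instead of raising. A small usage sketch of that behaviour (assuming the module is imported as gallery_dl.text, as elsewhere in the project; values are illustrative):

    from gallery_dl import text

    # extr() returns the substring between 'begin' and 'end', or the default on any failure
    text.extr('<a href="https://example.org/">link</a>', 'href="', '"')  # "https://example.org/"
    text.extr(None, 'href="', '"')                                       # ""  (no exception escapes)

    # the parse_* helpers fall back to their defaults the same way
    text.parse_int("123")      # 123
    text.parse_int("abc")      # 0    (default)
    text.parse_timestamp("x")  # None (default)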

View File

@ -16,7 +16,6 @@ import time
import random import random
import getpass import getpass
import hashlib import hashlib
import sqlite3
import binascii import binascii
import datetime import datetime
import functools import functools
@ -339,7 +338,7 @@ def extract_headers(response):
@functools.lru_cache(maxsize=None) @functools.lru_cache(maxsize=None)
def git_head(): def git_head():
try: try:
out, err = subprocess.Popen( out, err = Popen(
("git", "rev-parse", "--short", "HEAD"), ("git", "rev-parse", "--short", "HEAD"),
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
@ -579,6 +578,33 @@ GLOBALS = {
} }
if EXECUTABLE and hasattr(sys, "_MEIPASS"):
# https://github.com/pyinstaller/pyinstaller/blob/develop/doc
# /runtime-information.rst#ld_library_path--libpath-considerations
_popen_env = os.environ.copy()
orig = _popen_env.get("LD_LIBRARY_PATH_ORIG")
if orig is None:
_popen_env.pop("LD_LIBRARY_PATH", None)
else:
_popen_env["LD_LIBRARY_PATH"] = orig
orig = _popen_env.get("DYLD_LIBRARY_PATH_ORIG")
if orig is None:
_popen_env.pop("DYLD_LIBRARY_PATH", None)
else:
_popen_env["DYLD_LIBRARY_PATH"] = orig
del orig
class Popen(subprocess.Popen):
def __init__(self, args, **kwargs):
kwargs["env"] = _popen_env
subprocess.Popen.__init__(self, args, **kwargs)
else:
Popen = subprocess.Popen
def compile_expression(expr, name="<expr>", globals=None): def compile_expression(expr, name="<expr>", globals=None):
code_object = compile(expr, name, "eval") code_object = compile(expr, name, "eval")
return functools.partial(eval, code_object, globals or GLOBALS) return functools.partial(eval, code_object, globals or GLOBALS)
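The new Popen wrapper exists because PyInstaller's onefile bootloader rewrites LD_LIBRARY_PATH / DYLD_LIBRARY_PATH and stores the previous values in *_ORIG; spawning external tools (git, ffmpeg) with the modified path can make them pick up the bundled libraries. A minimal standalone sketch of the same technique (it uses sys.frozen/sys._MEIPASS as the bundle check, where the module above uses its own EXECUTABLE flag):

    import os
    import subprocess
    import sys

    def make_subprocess_env():
        """Return a copy of os.environ with the pre-PyInstaller library path restored."""
        env = os.environ.copy()
        if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
            for var in ("LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH"):
                orig = env.get(var + "_ORIG")
                if orig is None:
                    env.pop(var, None)   # variable was unset before bundling
                else:
                    env[var] = orig      # restore the original value
        return env

    # every external command gets the cleaned environment, e.g. the git call above
    subprocess.Popen(("git", "rev-parse", "--short", "HEAD"),
                     stdout=subprocess.PIPE, env=make_subprocess_env())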
@ -825,46 +851,3 @@ class FilterPredicate():
raise raise
except Exception as exc: except Exception as exc:
raise exception.FilterError(exc) raise exception.FilterError(exc)
class DownloadArchive():
def __init__(self, path, format_string, pragma=None,
cache_key="_archive_key"):
try:
con = sqlite3.connect(path, timeout=60, check_same_thread=False)
except sqlite3.OperationalError:
os.makedirs(os.path.dirname(path))
con = sqlite3.connect(path, timeout=60, check_same_thread=False)
con.isolation_level = None
from . import formatter
self.keygen = formatter.parse(format_string).format_map
self.close = con.close
self.cursor = cursor = con.cursor()
self._cache_key = cache_key
if pragma:
for stmt in pragma:
cursor.execute("PRAGMA " + stmt)
try:
cursor.execute("CREATE TABLE IF NOT EXISTS archive "
"(entry TEXT PRIMARY KEY) WITHOUT ROWID")
except sqlite3.OperationalError:
# fallback for missing WITHOUT ROWID support (#553)
cursor.execute("CREATE TABLE IF NOT EXISTS archive "
"(entry TEXT PRIMARY KEY)")
def check(self, kwdict):
"""Return True if the item described by 'kwdict' exists in archive"""
key = kwdict[self._cache_key] = self.keygen(kwdict)
self.cursor.execute(
"SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,))
return self.cursor.fetchone()
def add(self, kwdict):
"""Add item described by 'kwdict' to archive"""
key = kwdict.get(self._cache_key) or self.keygen(kwdict)
self.cursor.execute(
"INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,))

View File

@ -6,4 +6,4 @@
# it under the terms of the GNU General Public License version 2 as # it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation. # published by the Free Software Foundation.
__version__ = "1.26.9-dev" __version__ = "1.27.0-dev"

pyproject.toml (new file)

View File

@ -0,0 +1,3 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"

View File

@ -4,16 +4,37 @@
"""Build a standalone executable using PyInstaller""" """Build a standalone executable using PyInstaller"""
import PyInstaller.__main__ import PyInstaller.__main__
import argparse
import util import util
import os import sys
PyInstaller.__main__.run([
"--onefile", def main():
"--console", parser = argparse.ArgumentParser()
"--name", "gallery-dl." + ("exe" if os.name == "nt" else "bin"), parser.add_argument("-o", "--os")
"--additional-hooks-dir", util.path("scripts"), parser.add_argument("-a", "--arch")
"--distpath", util.path("dist"), parser.add_argument("-e", "--extension")
"--workpath", util.path("build"), args = parser.parse_args()
"--specpath", util.path("build"),
util.path("gallery_dl", "__main__.py"), name = "gallery-dl"
]) if args.os:
name = "{}_{}".format(name, args.os.partition("-")[0].lower())
if args.arch == "x86":
name += "_x86"
if args.extension:
name = "{}.{}".format(name, args.extension.lower())
PyInstaller.__main__.run([
"--onefile",
"--console",
"--name", name,
"--additional-hooks-dir", util.path("scripts"),
"--distpath", util.path("dist"),
"--workpath", util.path("build"),
"--specpath", util.path("build"),
util.path("gallery_dl", "__main__.py"),
])
if __name__ == "__main__":
sys.exit(main())
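The executable name is now derived from the --os/--arch/--extension arguments instead of being hard-coded. A small illustration of that naming logic, mirroring the script above (the argument values are hypothetical examples, not taken from this commit):

    def executable_name(os_name=None, arch=None, extension=None):
        # mirrors the name construction in scripts/pyinstaller.py shown above
        name = "gallery-dl"
        if os_name:
            name = "{}_{}".format(name, os_name.partition("-")[0].lower())
        if arch == "x86":
            name += "_x86"
        if extension:
            name = "{}.{}".format(name, extension.lower())
        return name

    print(executable_name("Windows-2019", "x86", "exe"))  # gallery-dl_windows_x86.exe
    print(executable_name("ubuntu-22.04", None, "bin"))   # gallery-dl_ubuntu.bin
    print(executable_name())                              # gallery-dl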

View File

@ -44,40 +44,52 @@ update-dev() {
build-python() { build-python() {
cd "${ROOTDIR}" cd "${ROOTDIR}"
echo Building bdist_wheel and sdist echo Building sdist and wheel
python setup.py bdist_wheel sdist python -m build
} }
build-linux() { build-linux() {
cd "${ROOTDIR}" cd "${ROOTDIR}"
echo Building Linux executable echo Building Linux executable
VENV_PATH="/tmp/venv" build-vm 'ubuntu22.04' 'gallery-dl.bin'
VENV_PYTHON="${VENV_PATH}/bin/python"
rm -rf "${VENV_PATH}"
python -m virtualenv "${VENV_PATH}"
$VENV_PYTHON -m pip install requests requests[socks] yt-dlp pyyaml secretstorage pyinstaller
$VENV_PYTHON ./scripts/pyinstaller.py
} }
build-windows() { build-windows() {
cd "${ROOTDIR}/dist" cd "${ROOTDIR}"
echo Building Windows executable echo Building Windows executable
# remove old executable build-vm 'windows7_x86_sp1' 'gallery-dl.exe'
rm -f "gallery-dl.exe" }
# build windows exe in vm build-vm() {
ln -fs "${ROOTDIR}" /tmp/ VMNAME="$1"
vmstart "windows7_x86_sp1" & BINNAME="$2"
TMPPATH="/tmp/gallery-dl/dist/$BINNAME"
# launch VM
vmstart "$VMNAME" &
disown disown
while [ ! -e "gallery-dl.exe" ] ; do
# copy source files
mkdir -p /tmp/gallery-dl
cp -a -t /tmp/gallery-dl -- \
./gallery_dl ./scripts ./data ./setup.py ./README.rst
# remove old executable
rm -f "./dist/$BINNAME"
# wait for new executable
while [ ! -e "$TMPPATH" ] ; do
sleep 5 sleep 5
done done
sleep 2 sleep 2
# move
mv "$TMPPATH" "./dist/$BINNAME"
rm -r /tmp/gallery-dl
} }
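build-vm stages the source tree under /tmp, boots the VM, then polls until the guest has produced the expected binary before collecting it into dist/. A rough Python equivalent of that wait-and-collect step (paths are illustrative only):

    import os
    import shutil
    import time

    def wait_and_collect(tmp_path, dest_path, interval=5):
        """Poll until the VM build drops the executable, then move it into dist/."""
        while not os.path.exists(tmp_path):
            time.sleep(interval)
        time.sleep(2)                # give the guest a moment to finish writing
        shutil.move(tmp_path, dest_path)

    # wait_and_collect("/tmp/gallery-dl/dist/gallery-dl.bin", "./dist/gallery-dl.bin")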
sign() { sign() {
@ -100,6 +112,14 @@ changelog() {
-e "s*\([( ]\)#\([0-9]\+\)*\1[#\2](https://github.com/mikf/gallery-dl/issues/\2)*g" \ -e "s*\([( ]\)#\([0-9]\+\)*\1[#\2](https://github.com/mikf/gallery-dl/issues/\2)*g" \
-e "s*^## \w\+\$*## ${NEWVERSION} - $(date +%Y-%m-%d)*" \ -e "s*^## \w\+\$*## ${NEWVERSION} - $(date +%Y-%m-%d)*" \
"${CHANGELOG}" "${CHANGELOG}"
mv "${CHANGELOG}" "${CHANGELOG}.orig"
# - remove all but the latest entries
sed -n \
-e '/^## /,/^$/ { /^$/q; p }' \
"${CHANGELOG}.orig" \
> "${CHANGELOG}"
} }
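The added sed invocation trims the changelog down to its most recent entry before release artifacts are built; the original file is kept as ${CHANGELOG}.orig and restored in upload-git further down. A rough Python equivalent of that sed program (the filename in the usage comment is hypothetical):

    def latest_changelog_entry(lines):
        """Keep everything from the first '## ' heading up to the next blank line."""
        out, in_entry = [], False
        for line in lines:
            if not in_entry:
                if line.startswith("## "):
                    in_entry = True
                    out.append(line)
            elif line.strip() == "":
                break                   # sed: /^$/q
            else:
                out.append(line)        # sed: p
        return out

    # with open("CHANGELOG.md.orig", encoding="utf-8") as src:
    #     print("".join(latest_changelog_entry(src)))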
supportedsites() { supportedsites() {
@ -117,6 +137,7 @@ upload-git() {
cd "${ROOTDIR}" cd "${ROOTDIR}"
echo Pushing changes to github echo Pushing changes to github
mv "${CHANGELOG}.orig" "${CHANGELOG}" || true
git add "gallery_dl/version.py" "${README}" "${CHANGELOG}" git add "gallery_dl/version.py" "${README}" "${CHANGELOG}"
git commit -S -m "release version ${NEWVERSION}" git commit -S -m "release version ${NEWVERSION}"
git tag -s -m "version ${NEWVERSION}" "v${NEWVERSION}" git tag -s -m "version ${NEWVERSION}" "v${NEWVERSION}"

View File

@ -143,6 +143,7 @@ CATEGORY_MAP = {
"webmshare" : "webmshare", "webmshare" : "webmshare",
"webtoons" : "Webtoon", "webtoons" : "Webtoon",
"wikiart" : "WikiArt.org", "wikiart" : "WikiArt.org",
"wikigg" : "wiki.gg",
"wikimediacommons": "Wikimedia Commons", "wikimediacommons": "Wikimedia Commons",
"xbunkr" : "xBunkr", "xbunkr" : "xBunkr",
"xhamster" : "xHamster", "xhamster" : "xHamster",
@ -273,6 +274,10 @@ SUBCATEGORY_MAP = {
"sexcom": { "sexcom": {
"pins": "User Pins", "pins": "User Pins",
}, },
"skeb": {
"following" : "Followed Creators",
"following-users": "Followed Users",
},
"smugmug": { "smugmug": {
"path": "Images from Users and Folders", "path": "Images from Users and Folders",
}, },
@ -337,12 +342,12 @@ URL_MAP = {
_OAUTH = '<a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a>' _OAUTH = '<a href="https://github.com/mikf/gallery-dl#oauth">OAuth</a>'
_COOKIES = '<a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a>' _COOKIES = '<a href="https://github.com/mikf/gallery-dl#cookies">Cookies</a>'
_APIKEY_DB = \ _APIKEY_DB = ('<a href="https://gdl-org.github.io/docs/configuration.html'
'<a href="configuration.rst#extractorderpibooruapi-key">API Key</a>' '#extractor-derpibooru-api-key">API Key</a>')
_APIKEY_WH = \ _APIKEY_WH = ('<a href="https://gdl-org.github.io/docs/configuration.html'
'<a href="configuration.rst#extractorwallhavenapi-key">API Key</a>' '#extractor-wallhaven-api-key">API Key</a>')
_APIKEY_WY = \ _APIKEY_WY = ('<a href="https://gdl-org.github.io/docs/configuration.html'
'<a href="configuration.rst#extractorweasylapi-key">API Key</a>' '#extractor-weasyl-api-key">API Key</a>')
AUTH_MAP = { AUTH_MAP = {
"aibooru" : "Supported", "aibooru" : "Supported",
@ -350,11 +355,13 @@ AUTH_MAP = {
"atfbooru" : "Supported", "atfbooru" : "Supported",
"baraag" : _OAUTH, "baraag" : _OAUTH,
"bluesky" : "Supported", "bluesky" : "Supported",
"booruvar" : "Supported",
"coomerparty" : "Supported", "coomerparty" : "Supported",
"danbooru" : "Supported", "danbooru" : "Supported",
"derpibooru" : _APIKEY_DB, "derpibooru" : _APIKEY_DB,
"deviantart" : _OAUTH, "deviantart" : _OAUTH,
"e621" : "Supported", "e621" : "Supported",
"e6ai" : "Supported",
"e926" : "Supported", "e926" : "Supported",
"e-hentai" : "Supported", "e-hentai" : "Supported",
"exhentai" : "Supported", "exhentai" : "Supported",
@ -362,6 +369,7 @@ AUTH_MAP = {
"fantia" : _COOKIES, "fantia" : _COOKIES,
"flickr" : _OAUTH, "flickr" : _OAUTH,
"furaffinity" : _COOKIES, "furaffinity" : _COOKIES,
"furbooru" : "API Key",
"horne" : "Required", "horne" : "Required",
"idolcomplex" : "Supported", "idolcomplex" : "Supported",
"imgbb" : "Supported", "imgbb" : "Supported",
@ -382,7 +390,6 @@ AUTH_MAP = {
"reddit" : _OAUTH, "reddit" : _OAUTH,
"sankaku" : "Supported", "sankaku" : "Supported",
"seiga" : _COOKIES, "seiga" : _COOKIES,
"seisoparty" : "Supported",
"smugmug" : _OAUTH, "smugmug" : _OAUTH,
"subscribestar" : "Supported", "subscribestar" : "Supported",
"tapas" : "Supported", "tapas" : "Supported",

View File

@ -1,5 +1,5 @@
#!/usr/bin/env bash #!/usr/bin/env bash
# This is the maintainence launcher for the snap, make necessary runtime environment changes to make the snap work here. You may also insert security confinement/deprecation/obsoletion notice of the snap here. # This is the maintenance launcher for the snap, make necessary runtime environment changes to make the snap work here. You may also insert security confinement/deprecation/obsoletion notice of the snap here.
set \ set \
-o errexit \ -o errexit \

View File

@ -37,7 +37,7 @@ plugs:
# Network access # Network access
network: network:
# For network service for recieving OAuth callback tokens # For network service for receiving OAuth callback tokens
network-bind: network-bind:
# Configuration access # Configuration access

View File

@ -73,7 +73,7 @@ __tests__ = (
"#category": ("", "8chan", "board"), "#category": ("", "8chan", "board"),
"#class" : _8chan._8chanBoardExtractor, "#class" : _8chan._8chanBoardExtractor,
"#pattern" : _8chan._8chanThreadExtractor.pattern, "#pattern" : _8chan._8chanThreadExtractor.pattern,
"#count" : 27, "#count" : range(24, 28),
}, },
{ {

View File

@ -14,4 +14,12 @@ __tests__ = (
"#class" : wikimedia.WikimediaArticleExtractor, "#class" : wikimedia.WikimediaArticleExtractor,
}, },
{
"#url" : "https://azurlane.koumakan.jp/wiki/Louisville/Gallery",
"#comment" : "entries with missing 'imageinfo' (#5384)",
"#category": ("wikimedia", "azurlanewiki", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
"#count" : "> 10",
},
) )

View File

@ -12,7 +12,7 @@ __tests__ = (
"#url" : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html", "#url" : "https://julianbphotography.blogspot.com/2010/12/moon-rise.html",
"#category": ("blogger", "blogspot", "post"), "#category": ("blogger", "blogspot", "post"),
"#class" : blogger.BloggerPostExtractor, "#class" : blogger.BloggerPostExtractor,
"#urls" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", "#urls" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjH9WkPvLJq2moxKtyt3ieJZWSDFQwOi3PHRdlHVHEQHRwy-d86Jg6HWSMhxaa6EgvlXq-zDMmKM4kIPn27eJ9Hepk2X9e9HQhqwMfrT8RYTnFe65uexw7KSk5FdWHxRVp5crz3p_qph3Bj/s0/Icy-Moonrise---For-Web.jpg",
"blog": { "blog": {
"date" : "dt:2010-11-21 18:19:42", "date" : "dt:2010-11-21 18:19:42",
@ -43,7 +43,7 @@ __tests__ = (
"extension": "jpg", "extension": "jpg",
"filename" : "Icy-Moonrise---For-Web", "filename" : "Icy-Moonrise---For-Web",
"num" : 1, "num" : 1,
"url" : "https://3.bp.blogspot.com/-zlJddJtJOUo/Tt4WooTPNtI/AAAAAAAABG8/dGT2cGp2E7Y/s0/Icy-Moonrise---For-Web.jpg", "url" : "https://blogger.googleusercontent.com/img/b/R29vZ2xl/AVvXsEjH9WkPvLJq2moxKtyt3ieJZWSDFQwOi3PHRdlHVHEQHRwy-d86Jg6HWSMhxaa6EgvlXq-zDMmKM4kIPn27eJ9Hepk2X9e9HQhqwMfrT8RYTnFe65uexw7KSk5FdWHxRVp5crz3p_qph3Bj/s0/Icy-Moonrise---For-Web.jpg",
}, },
{ {
@ -59,7 +59,7 @@ __tests__ = (
"#comment" : "new image domain (#2204)", "#comment" : "new image domain (#2204)",
"#category": ("blogger", "blogspot", "post"), "#category": ("blogger", "blogspot", "post"),
"#class" : blogger.BloggerPostExtractor, "#class" : blogger.BloggerPostExtractor,
"#pattern" : "https://blogger.googleusercontent.com/img/a/.+=s0$", "#pattern" : r"https://blogger\.googleusercontent\.com/img/.+=s0$",
"#count" : 8, "#count" : 8,
}, },
@ -67,7 +67,7 @@ __tests__ = (
"#url" : "https://julianbphotography.blogspot.com/", "#url" : "https://julianbphotography.blogspot.com/",
"#category": ("blogger", "blogspot", "blog"), "#category": ("blogger", "blogspot", "blog"),
"#class" : blogger.BloggerBlogExtractor, "#class" : blogger.BloggerBlogExtractor,
"#pattern" : r"https://\d\.bp\.blogspot\.com/.*/s0/[^.]+\.jpg", "#pattern" : r"https://blogger\.googleusercontent\.com/img/.+/s0/",
"#range" : "1-25", "#range" : "1-25",
"#count" : 25, "#count" : 25,
}, },

View File

@ -133,6 +133,7 @@ __tests__ = (
"filename" : "bafkreidypzoaybmfj5h7pnpiyct6ng5yae6ydp4czrm72ocg7ev6vbirri", "filename" : "bafkreidypzoaybmfj5h7pnpiyct6ng5yae6ydp4czrm72ocg7ev6vbirri",
"height" : 630, "height" : 630,
"indexedAt" : "2023-12-22T18:58:32.715Z", "indexedAt" : "2023-12-22T18:58:32.715Z",
"instance" : "bsky.app",
"labels" : [], "labels" : [],
"likeCount" : int, "likeCount" : int,
"num" : 1, "num" : 1,
@ -153,7 +154,7 @@ __tests__ = (
"followersCount": int, "followersCount": int,
"followsCount" : int, "followsCount" : int,
"handle" : "bsky.app", "handle" : "bsky.app",
"indexedAt" : "2023-12-22T18:54:12.339Z", "indexedAt" : "2024-01-20T05:04:41.904Z",
"labels" : [], "labels" : [],
"postsCount" : int, "postsCount" : int,
}, },

View File

@ -13,13 +13,12 @@ __tests__ = (
"#category": ("lolisafe", "bunkr", "album"), "#category": ("lolisafe", "bunkr", "album"),
"#class" : bunkr.BunkrAlbumExtractor, "#class" : bunkr.BunkrAlbumExtractor,
"#urls" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", "#urls" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png",
"#sha1_content": "f38b54b17cd7462e687b58d83f00fca88b1b105a", "#sha1_content": "961b25d85b5f5bd18cbe3e847ac55925f14d0286",
"album_id" : "Lktg9Keq", "album_id" : "Lktg9Keq",
"album_name" : "test テスト \"&>", "album_name" : "test テスト \"&>",
"album_size" : "182 B", "album_size" : "182 B",
"count" : 1, "count" : 1,
"description": "",
"extension" : "png", "extension" : "png",
"file" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png", "file" : "https://i-burger.bunkr.ru/test-テスト-\"&>-QjgneIQv.png",
"filename" : "test-テスト-\"&>-QjgneIQv", "filename" : "test-テスト-\"&>-QjgneIQv",
@ -43,7 +42,6 @@ __tests__ = (
"album_name" : "test2", "album_name" : "test2",
"album_size" : "561.6 KB", "album_size" : "561.6 KB",
"count" : 2, "count" : 2,
"description": "",
"filename" : r"re:video-gLn1hgpw|image-sZrQUeOx", "filename" : r"re:video-gLn1hgpw|image-sZrQUeOx",
"id" : r"re:gLn1hgpw|sZrQUeOx", "id" : r"re:gLn1hgpw|sZrQUeOx",
"name" : r"re:video|image", "name" : r"re:video|image",

View File

@ -15,12 +15,32 @@ __tests__ = (
"#sha1_url": "e7d624aded15a069194e38dc731ec23217a422fb", "#sha1_url": "e7d624aded15a069194e38dc731ec23217a422fb",
}, },
{
"#url" : "https://desuarchive.org/a",
"#category": ("foolfuuka", "desuarchive", "board"),
"#class" : foolfuuka.FoolfuukaBoardExtractor,
},
{ {
"#url" : "https://desuarchive.org/a/", "#url" : "https://desuarchive.org/a/",
"#category": ("foolfuuka", "desuarchive", "board"), "#category": ("foolfuuka", "desuarchive", "board"),
"#class" : foolfuuka.FoolfuukaBoardExtractor, "#class" : foolfuuka.FoolfuukaBoardExtractor,
}, },
{
"#url" : "https://desuarchive.org/a/2",
"#category": ("foolfuuka", "desuarchive", "board"),
"#class" : foolfuuka.FoolfuukaBoardExtractor,
},
{
"#url" : "https://desuarchive.org/a/page/2",
"#category": ("foolfuuka", "desuarchive", "board"),
"#class" : foolfuuka.FoolfuukaBoardExtractor,
"#pattern" : foolfuuka.FoolfuukaThreadExtractor.pattern,
"#count" : 10,
},
{ {
"#url" : "https://desuarchive.org/_/search/text/test/", "#url" : "https://desuarchive.org/_/search/text/test/",
"#category": ("foolfuuka", "desuarchive", "search"), "#category": ("foolfuuka", "desuarchive", "search"),

View File

@ -252,6 +252,14 @@ __tests__ = (
), ),
}, },
{
"#url" : "https://deviantart.com/h3813067/avatar",
"#comment" : "default avatar (#5276)",
"#category": ("", "deviantart", "avatar"),
"#class" : deviantart.DeviantartAvatarExtractor,
"#count" : 0,
},
{ {
"#url" : "https://deviantart.com/gdldev/banner", "#url" : "https://deviantart.com/gdldev/banner",
"#category": ("", "deviantart", "background"), "#category": ("", "deviantart", "background"),
@ -300,7 +308,7 @@ __tests__ = (
"target" : dict, "target" : dict,
"thumbs" : list, "thumbs" : list,
"title" : "Banner", "title" : "Banner",
"url" : "https://sta.sh/0198jippkeys", "url" : "https://www.deviantart.com/stash/0198jippkeys",
"username" : "gdldev", "username" : "gdldev",
}, },
@ -352,13 +360,38 @@ __tests__ = (
"#class" : deviantart.DeviantartFolderExtractor, "#class" : deviantart.DeviantartFolderExtractor,
}, },
{
"#url" : "https://www.deviantart.com/stash/022c83odnaxc",
"#category": ("", "deviantart", "stash"),
"#class" : deviantart.DeviantartStashExtractor,
"#pattern" : r"https://wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/940f2d05-c5eb-4917-8192-7eb6a2d508c6/dcvdmbc-e506cdcf-3208-4c20-85ab-0bfa8a7bcb16.png\?token=ey.+",
"#count" : 1,
"#sha1_content": "057eb2f2861f6c8a96876b13cca1a4b7a408c11f",
"content": {
"filename": "01_by_justatest235723_dcvdmbc.png",
"filesize": 380,
"width" : 128,
"height" : 128,
"src" : r"re:https://wixmp-ed30a86b8c4ca887773594c2.wixmp.com/f/940f2d05-c5eb-4917-8192-7eb6a2d508c6/dcvdmbc-e506cdcf-3208-4c20-85ab-0bfa8a7bcb16.png\?token=ey.+",
},
"da_category" : "Uncategorized",
"date" : "dt:2018-12-26 14:49:27",
"deviationid" : "A4A6AD52-8857-46EE-ABFE-86D49D4FF9D0",
"download_filesize": 380,
"extension" : "png",
"filename" : "01_by_justatest235723-dcvdmbc",
"index" : 778297656,
"index_base36" : "cvdmbc",
"published_time": 1545835767,
"title" : "01",
"url" : "https://www.deviantart.com/stash/022c83odnaxc",
},
{ {
"#url" : "https://sta.sh/022c83odnaxc", "#url" : "https://sta.sh/022c83odnaxc",
"#category": ("", "deviantart", "stash"), "#category": ("", "deviantart", "stash"),
"#class" : deviantart.DeviantartStashExtractor, "#class" : deviantart.DeviantartStashExtractor,
"#pattern" : r"https://wixmp-[^.]+\.wixmp\.com/f/.+/.+\.png\?token=.+",
"#count" : 1,
"#sha1_content": "057eb2f2861f6c8a96876b13cca1a4b7a408c11f",
}, },
{ {
@ -556,7 +589,7 @@ __tests__ = (
"index" : int, "index" : int,
"index_base36": r"re:^[0-9a-z]+$", "index_base36": r"re:^[0-9a-z]+$",
"url" : r"re:^https://sta.sh", "url" : r"re:^https://www.deviantart.com/stash/\w+",
}, },
{ {

View File

@ -83,6 +83,15 @@ __tests__ = (
"width" : 728, "width" : 728,
}, },
{
"#url" : "https://hearthstone.fandom.com/wiki/Flame_Juggler",
"#comment" : "empty 'metadata'",
"#category": ("wikimedia", "fandom-hearthstone", "article"),
"#class" : wikimedia.WikimediaArticleExtractor,
"metadata" : {},
},
{ {
"#url" : "https://projectsekai.fandom.com/wiki/Project_SEKAI_Wiki", "#url" : "https://projectsekai.fandom.com/wiki/Project_SEKAI_Wiki",
"#category": ("wikimedia", "fandom-projectsekai", "article"), "#category": ("wikimedia", "fandom-projectsekai", "article"),

View File

@ -121,6 +121,24 @@ __tests__ = (
"#class" : furaffinity.FuraffinityPostExtractor, "#class" : furaffinity.FuraffinityPostExtractor,
}, },
{
"#url" : "https://fxfuraffinity.net/view/21835115/",
"#category": ("", "furaffinity", "post"),
"#class" : furaffinity.FuraffinityPostExtractor,
},
{
"#url" : "https://xfuraffinity.net/view/21835115/",
"#category": ("", "furaffinity", "post"),
"#class" : furaffinity.FuraffinityPostExtractor,
},
{
"#url" : "https://fxraffinity.net/view/21835115/",
"#category": ("", "furaffinity", "post"),
"#class" : furaffinity.FuraffinityPostExtractor,
},
{ {
"#url" : "https://sfw.furaffinity.net/view/21835115/", "#url" : "https://sfw.furaffinity.net/view/21835115/",
"#category": ("", "furaffinity", "post"), "#category": ("", "furaffinity", "post"),

View File

@ -39,6 +39,22 @@ __tests__ = (
"#sha1_url": "845a61aa1f90fb4ced841e8b7e62098be2e967bf", "#sha1_url": "845a61aa1f90fb4ced841e8b7e62098be2e967bf",
}, },
{
"#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=id:>=67800+id:<=68000",
"#comment" : "meta tags (#5478)",
"#category": ("booru", "gelbooru", "tag"),
"#class" : gelbooru.GelbooruTagExtractor,
"#count" : 187,
},
{
"#url" : "https://gelbooru.com/index.php?page=post&s=list&tags=id:>=67800+id:<=68000+sort:id:asc",
"#comment" : "meta + sort tags (#5478)",
"#category": ("booru", "gelbooru", "tag"),
"#class" : gelbooru.GelbooruTagExtractor,
"#count" : 187,
},
{ {
"#url" : "https://gelbooru.com/index.php?page=pool&s=show&id=761", "#url" : "https://gelbooru.com/index.php?page=pool&s=show&id=761",
"#category": ("booru", "gelbooru", "pool"), "#category": ("booru", "gelbooru", "pool"),
@ -47,10 +63,30 @@ __tests__ = (
}, },
{ {
"#url" : "https://gelbooru.com/index.php?page=favorites&s=view&id=279415", "#url" : "https://gelbooru.com/index.php?page=favorites&s=view&id=1435674",
"#category": ("booru", "gelbooru", "favorite"), "#category": ("booru", "gelbooru", "favorite"),
"#class" : gelbooru.GelbooruFavoriteExtractor, "#class" : gelbooru.GelbooruFavoriteExtractor,
"#count" : 3, "#urls" : (
"https://img3.gelbooru.com/images/5d/30/5d30fc056ed8668616b3c440df9bac89.jpg",
"https://img3.gelbooru.com/images/4c/2d/4c2da867ed643acdadd8105177dcdaf0.png",
"https://img3.gelbooru.com/images/c8/26/c826f3cb90d9aaca8d0632a96bf4abe8.jpg",
"https://img3.gelbooru.com/images/c1/fe/c1fe59c0bc8ce955dd353544b1015d0c.jpg",
"https://img3.gelbooru.com/images/e6/6d/e66d8883c184f5d3b2591dfcdf0d007c.jpg",
),
},
{
"#url" : "https://gelbooru.com/index.php?page=favorites&s=view&id=1435674",
"#category": ("booru", "gelbooru", "favorite"),
"#class" : gelbooru.GelbooruFavoriteExtractor,
"#options" : {"order-posts": "reverse"},
"#urls" : (
"https://img3.gelbooru.com/images/e6/6d/e66d8883c184f5d3b2591dfcdf0d007c.jpg",
"https://img3.gelbooru.com/images/c1/fe/c1fe59c0bc8ce955dd353544b1015d0c.jpg",
"https://img3.gelbooru.com/images/c8/26/c826f3cb90d9aaca8d0632a96bf4abe8.jpg",
"https://img3.gelbooru.com/images/4c/2d/4c2da867ed643acdadd8105177dcdaf0.png",
"https://img3.gelbooru.com/images/5d/30/5d30fc056ed8668616b3c440df9bac89.jpg",
),
}, },
{ {

View File

@ -29,10 +29,11 @@ __tests__ = (
}, },
{ {
"#url" : "https://www.hentai-foundry.com/pictures/user/Evulchibi/scraps", "#url" : "https://www.hentai-foundry.com/pictures/user/Ethevian/scraps",
"#category": ("", "hentaifoundry", "scraps"), "#category": ("", "hentaifoundry", "scraps"),
"#class" : hentaifoundry.HentaifoundryScrapsExtractor, "#class" : hentaifoundry.HentaifoundryScrapsExtractor,
"#sha1_url": "7cd9c6ec6258c4ab8c44991f7731be82337492a7", "#pattern" : r"https://pictures\.hentai-foundry\.com/e/Ethevian/.+",
"#count" : ">= 10",
}, },
{ {

View File

@ -9,7 +9,7 @@ from gallery_dl.extractor import hiperdex
__tests__ = ( __tests__ = (
{ {
"#url" : "https://hiperdex.com/manga/domestic-na-kanojo/154-5/", "#url" : "https://hiperdex.com/mangas/domestic-na-kanojo/154-5/",
"#category": ("", "hiperdex", "chapter"), "#category": ("", "hiperdex", "chapter"),
"#class" : hiperdex.HiperdexChapterExtractor, "#class" : hiperdex.HiperdexChapterExtractor,
"#pattern" : r"https://(1st)?hiperdex\d?.(com|net|info)/wp-content/uploads/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp", "#pattern" : r"https://(1st)?hiperdex\d?.(com|net|info)/wp-content/uploads/WP-manga/data/manga_\w+/[0-9a-f]{32}/\d+\.webp",
@ -27,6 +27,12 @@ __tests__ = (
"type" : "Manga", "type" : "Manga",
}, },
{
"#url" : "https://hiperdex.com/manga/domestic-na-kanojo/154-5/",
"#category": ("", "hiperdex", "chapter"),
"#class" : hiperdex.HiperdexChapterExtractor,
},
{ {
"#url" : "https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/", "#url" : "https://1sthiperdex.com/manga/domestic-na-kanojo/154-5/",
"#category": ("", "hiperdex", "chapter"), "#category": ("", "hiperdex", "chapter"),

View File

@ -5,6 +5,7 @@
# published by the Free Software Foundation. # published by the Free Software Foundation.
from gallery_dl.extractor import hitomi from gallery_dl.extractor import hitomi
from gallery_dl import exception
__tests__ = ( __tests__ = (
@ -47,9 +48,7 @@ __tests__ = (
"#comment" : "gallery with 'broken' redirect", "#comment" : "gallery with 'broken' redirect",
"#category": ("", "hitomi", "gallery"), "#category": ("", "hitomi", "gallery"),
"#class" : hitomi.HitomiGalleryExtractor, "#class" : hitomi.HitomiGalleryExtractor,
"#options" : {"format": "original"}, "#exception": exception.NotFoundError,
"#pattern" : r"https://[a-c]b\.hitomi\.la/images/\d+/\d+/[0-9a-f]{64}\.jpg",
"#count" : 10,
}, },
{ {

View File

@ -42,7 +42,7 @@ __tests__ = (
}, },
{ {
"#url" : "https://idol.sankakucomplex.com/pools/show/145", "#url" : "https://idol.sankakucomplex.com/en/pools/e9PMwnwRBK3",
"#category": ("booru", "idolcomplex", "pool"), "#category": ("booru", "idolcomplex", "pool"),
"#class" : idolcomplex.IdolcomplexPoolExtractor, "#class" : idolcomplex.IdolcomplexPoolExtractor,
"#count" : 3, "#count" : 3,
@ -72,16 +72,16 @@ __tests__ = (
"file_url" : r"re:https://i[sv]\.sankakucomplex\.com/data/50/9e/509eccbba54a43cea6b275a65b93c51d\.jpg\?", "file_url" : r"re:https://i[sv]\.sankakucomplex\.com/data/50/9e/509eccbba54a43cea6b275a65b93c51d\.jpg\?",
"filename" : "509eccbba54a43cea6b275a65b93c51d", "filename" : "509eccbba54a43cea6b275a65b93c51d",
"height" : 683, "height" : 683,
"id" : 694215, "id" : "vkr36qdOaZ4", # legacy ID: 694215
"id_alnum" : "vkr36qdOaZ4",
"md5" : "509eccbba54a43cea6b275a65b93c51d", "md5" : "509eccbba54a43cea6b275a65b93c51d",
"rating" : "g", "rating" : "g",
"tags" : "lyumos the_witcher shani_(the_witcher) 1girl green_eyes non-asian redhead waistcoat wreath cosplay 3:2_aspect_ratio", "tags" : "lyumos the_witcher shani_(the_witcher) 1girl green_eyes non-asian redhead waistcoat wreath cosplay 3:2_aspect_ratio",
"tags_character": "shani_(the_witcher)", "tags_character": "shani_(the_witcher)",
"tags_copyright": "the_witcher", "tags_copyright": "the_witcher",
"tags_general" : "1girl green_eyes non-asian redhead waistcoat wreath", "tags_general" : "1girl green_eyes non-asian redhead waistcoat wreath",
"tags_genre" : "cosplay",
"tags_idol" : "lyumos", "tags_idol" : "lyumos",
"tags_medium" : "cosplay 3:2_aspect_ratio", "tags_medium" : "3:2_aspect_ratio",
"vote_average" : range(4, 5), "vote_average" : range(4, 5),
"vote_count" : range(25, 40), "vote_count" : range(25, 40),
"width" : 1024, "width" : 1024,
@ -111,8 +111,7 @@ __tests__ = (
"#class" : idolcomplex.IdolcomplexPostExtractor, "#class" : idolcomplex.IdolcomplexPostExtractor,
"#sha1_content": "694ec2491240787d75bf5d0c75d0082b53a85afd", "#sha1_content": "694ec2491240787d75bf5d0c75d0082b53a85afd",
"id" : 694215, "id" : "vkr36qdOaZ4", # legacy ID: 694215
"id_alnum" : "vkr36qdOaZ4",
"tags_character": "shani_(the_witcher)", "tags_character": "shani_(the_witcher)",
"tags_copyright": "the_witcher", "tags_copyright": "the_witcher",
"tags_idol" : str, "tags_idol" : str,

View File

@ -120,11 +120,25 @@ __tests__ = (
"#sha1_url": "37822523e6e4a56feb9dea35653760c86b44ff89", "#sha1_url": "37822523e6e4a56feb9dea35653760c86b44ff89",
}, },
{
"#url" : "https://www.imagefap.com/organizer/613950/Grace-Stout",
"#category": ("", "imagefap", "folder"),
"#class" : imagefap.ImagefapFolderExtractor,
"#pattern" : imagefap.ImagefapGalleryExtractor.pattern,
"#count" : 31,
"title": r"re:Grace Stout .+",
},
{ {
"#url" : "https://www.imagefap.com/usergallery.php?userid=1981976&folderid=409758", "#url" : "https://www.imagefap.com/usergallery.php?userid=1981976&folderid=409758",
"#category": ("", "imagefap", "folder"), "#category": ("", "imagefap", "folder"),
"#class" : imagefap.ImagefapFolderExtractor, "#class" : imagefap.ImagefapFolderExtractor,
"#sha1_url": "37822523e6e4a56feb9dea35653760c86b44ff89", "#urls" : "https://www.imagefap.com/gallery/7876223",
"folder" : "Softcore",
"gallery_id": "7876223",
"title" : "Kelsi Monroe in lingerie",
}, },
{ {
@ -140,6 +154,8 @@ __tests__ = (
"#class" : imagefap.ImagefapFolderExtractor, "#class" : imagefap.ImagefapFolderExtractor,
"#pattern" : imagefap.ImagefapGalleryExtractor.pattern, "#pattern" : imagefap.ImagefapGalleryExtractor.pattern,
"#range" : "1-40", "#range" : "1-40",
"folder": "Uncategorized",
}, },
{ {

View File

@ -89,11 +89,10 @@ __tests__ = (
}, },
{ {
"#url" : "https://kemono.party/gumroad/user/trylsc/post/IURjT", "#url" : "https://kemono.su/gumroad/user/3101696181060/post/tOWyf",
"#comment" : "kemono.party -> data.kemono.party",
"#category": ("", "kemonoparty", "gumroad"), "#category": ("", "kemonoparty", "gumroad"),
"#class" : kemonoparty.KemonopartyPostExtractor, "#class" : kemonoparty.KemonopartyPostExtractor,
"#pattern" : r"https://kemono\.party/data/(a4/7b/a47bfe938d8c1682eef06e885927484cd8df1b.+\.jpg|c6/04/c6048f5067fd9dbfa7a8be565ac194efdfb6e4.+\.zip)", "#urls" : "https://kemono.su/data/6f/13/6f1394b19516396ea520254350662c254bbea30c1e111fd4b0f042c61c426d07.zip",
}, },
{ {
@ -136,6 +135,19 @@ __tests__ = (
}], }],
}, },
{
"#url" : "https://kemono.su/patreon/user/3161935/post/68231671",
"#comment" : "announcements",
"#category": ("", "kemonoparty", "patreon"),
"#class" : kemonoparty.KemonopartyPostExtractor,
"#options" : {"announcements": True},
"announcements": [{
"body": "<div><strong>Thank you so much for the support!</strong><strong><br></strong>This Patreon is more of a tip jar for supporting what I make. I have to clarify that there are <strong>no exclusive Patreon animations</strong> because all are released for the public. You will get earlier access to WIPs. Direct downloads to my works are also available for $5 and $10 Tiers.</div>",
"date": "2023-02",
}],
},
{ {
"#url" : "https://kemono.su/patreon/user/19623797/post/29035449", "#url" : "https://kemono.su/patreon/user/19623797/post/29035449",
"#comment" : "invalid file (#3510)", "#comment" : "invalid file (#3510)",
@ -195,6 +207,7 @@ __tests__ = (
"hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", "hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86",
"revision_id" : 142470, "revision_id" : 142470,
"revision_index": 2, "revision_index": 2,
"revision_count": 9,
"revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40",
}, },
@ -210,6 +223,7 @@ __tests__ = (
"hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86", "hash" : "88521f71822dfa2f42df3beba319ea4fceda2a2d6dc59da0276a75238f743f86",
"revision_id" : 0, "revision_id" : 0,
"revision_index": 1, "revision_index": 1,
"revision_count": 1,
"revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40", "revision_hash" : "e0e93281495e151b11636c156e52bfe9234c2a40",
}, },
@ -224,6 +238,7 @@ __tests__ = (
"revision_id": range(134996, 3052965), "revision_id": range(134996, 3052965),
"revision_index": range(1, 9), "revision_index": range(1, 9),
"revision_count": 9,
"revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40", "revision_hash": "e0e93281495e151b11636c156e52bfe9234c2a40",
}, },
@ -246,6 +261,16 @@ __tests__ = (
"published": "2022-07-29T21:12:11.483000", "published": "2022-07-29T21:12:11.483000",
}, },
{
"#url" : "https://kemono.su/gumroad/user/3267960360326/post/jwwag",
"#comment" : "empty 'file' with no 'path' (#5368)",
"#category": ("", "kemonoparty", "gumroad"),
"#class" : kemonoparty.KemonopartyPostExtractor,
"#count" : 8,
"type" : "attachment",
},
{ {
"#url" : "https://kemono.su/discord/server/488668827274444803#608504710906904576", "#url" : "https://kemono.su/discord/server/488668827274444803#608504710906904576",
"#category": ("", "kemonoparty", "discord"), "#category": ("", "kemonoparty", "discord"),
@ -340,8 +365,24 @@ __tests__ = (
"#class" : kemonoparty.KemonopartyFavoriteExtractor, "#class" : kemonoparty.KemonopartyFavoriteExtractor,
"#pattern" : kemonoparty.KemonopartyUserExtractor.pattern, "#pattern" : kemonoparty.KemonopartyUserExtractor.pattern,
"#auth" : True, "#auth" : True,
"#count" : 3, "#urls" : (
"#sha1_url": "902c656c8002a3257ef9e255cb69bca1937373d4", "https://kemono.su/patreon/user/881792",
"https://kemono.su/fanbox/user/6993449",
"https://kemono.su/subscribestar/user/alcorart",
),
},
{
"#url" : "https://kemono.su/favorites?type=artist&sort=faved_seq&order=asc",
"#category": ("", "kemonoparty", "favorite"),
"#class" : kemonoparty.KemonopartyFavoriteExtractor,
"#pattern" : kemonoparty.KemonopartyUserExtractor.pattern,
"#auth" : True,
"#urls" : (
"https://kemono.su/fanbox/user/6993449",
"https://kemono.su/patreon/user/881792",
"https://kemono.su/subscribestar/user/alcorart",
),
}, },
{ {
@ -350,8 +391,24 @@ __tests__ = (
"#class" : kemonoparty.KemonopartyFavoriteExtractor, "#class" : kemonoparty.KemonopartyFavoriteExtractor,
"#pattern" : kemonoparty.KemonopartyPostExtractor.pattern, "#pattern" : kemonoparty.KemonopartyPostExtractor.pattern,
"#auth" : True, "#auth" : True,
"#count" : 3, "#urls" : (
"#sha1_url": "4be8e84cb384a907a8e7997baaf6287b451783b5", "https://kemono.su/subscribestar/user/alcorart/post/184329",
"https://kemono.su/fanbox/user/6993449/post/23913",
"https://kemono.su/patreon/user/881792/post/4769638",
),
},
{
"#url" : "https://kemono.su/favorites?type=post&sort=published&order=asc",
"#category": ("", "kemonoparty", "favorite"),
"#class" : kemonoparty.KemonopartyFavoriteExtractor,
"#pattern" : kemonoparty.KemonopartyPostExtractor.pattern,
"#auth" : True,
"#urls" : (
"https://kemono.su/patreon/user/881792/post/4769638",
"https://kemono.su/fanbox/user/6993449/post/23913",
"https://kemono.su/subscribestar/user/alcorart/post/184329",
),
}, },
) )

View File

@ -32,7 +32,7 @@ __tests__ = (
"#url" : "https://lensdump.com/i/tyoAyM", "#url" : "https://lensdump.com/i/tyoAyM",
"#category": ("", "lensdump", "image"), "#category": ("", "lensdump", "image"),
"#class" : lensdump.LensdumpImageExtractor, "#class" : lensdump.LensdumpImageExtractor,
"#pattern" : r"https://c\.l3n\.co/i/tyoAyM\.webp", "#urls" : "https://c.l3n.co/i/tyoAyM.webp",
"#sha1_content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46", "#sha1_content": "1aa749ed2c0cf679ec8e1df60068edaf3875de46",
"date" : "dt:2022-08-01 08:24:28", "date" : "dt:2022-08-01 08:24:28",
@ -45,4 +45,32 @@ __tests__ = (
"width" : 620, "width" : 620,
}, },
{
"#url" : "https://c.l3n.co/i/tyoAyM.webp",
"#category": ("", "lensdump", "image"),
"#class" : lensdump.LensdumpImageExtractor,
"#urls" : "https://c.l3n.co/i/tyoAyM.webp",
"date" : "dt:2022-08-01 08:24:28",
"extension": "webp",
"filename" : "tyoAyM",
"height" : 400,
"id" : "tyoAyM",
"title" : "MYOBI clovis bookcaseset",
"url" : "https://c.l3n.co/i/tyoAyM.webp",
"width" : 620,
},
{
"#url" : "https://i.lensdump.com/i/tyoAyM",
"#category": ("", "lensdump", "image"),
"#class" : lensdump.LensdumpImageExtractor,
},
{
"#url" : "https://i3.lensdump.com/i/tyoAyM",
"#category": ("", "lensdump", "image"),
"#class" : lensdump.LensdumpImageExtractor,
},
) )

View File

@ -18,4 +18,15 @@ __tests__ = (
"instance_remote": None, "instance_remote": None,
}, },
{
"#url" : "mastodon:https://wanderingwires.net/@quarc/9qppkxzyd1ee3i9p",
"#comment" : "null moved account",
"#category": ("mastodon", "wanderingwires.net", "status"),
"#class" : mastodon.MastodonStatusExtractor,
"#urls" : "https://s3.wanderingwires.net/null/4377e826-72ab-4659-885c-fa12945eb207.png",
"instance": "wanderingwires.net",
"instance_remote": None,
},
) )

View File

@ -74,6 +74,33 @@ __tests__ = (
"#url" : "https://mastodon.social/bookmarks", "#url" : "https://mastodon.social/bookmarks",
"#category": ("mastodon", "mastodon.social", "bookmark"), "#category": ("mastodon", "mastodon.social", "bookmark"),
"#class" : mastodon.MastodonBookmarkExtractor, "#class" : mastodon.MastodonBookmarkExtractor,
"#auth" : True,
"#urls" : "https://files.mastodon.social/media_attachments/files/111/331/603/082/304/823/original/e12cde371c88c1b0.png",
},
{
"#url" : "https://mastodon.social/favourites",
"#category": ("mastodon", "mastodon.social", "favorite"),
"#class" : mastodon.MastodonFavoriteExtractor,
"#auth" : True,
"#urls" : "https://files.mastodon.social/media_attachments/files/111/331/603/082/304/823/original/e12cde371c88c1b0.png",
},
{
"#url" : "https://mastodon.social/lists/92653",
"#category": ("mastodon", "mastodon.social", "list"),
"#class" : mastodon.MastodonListExtractor,
"#auth" : True,
"#pattern" : r"https://files\.mastodon\.social/media_attachments/files/(\d+/){3,}original/\w+",
"#range" : "1-10",
},
{
"#url" : "https://mastodon.social/tags/mastodon",
"#category": ("mastodon", "mastodon.social", "hashtag"),
"#class" : mastodon.MastodonHashtagExtractor,
"#pattern" : r"https://files\.mastodon\.social/media_attachments/files/(\d+/){3,}original/\w+",
"#range" : "1-10",
}, },
{ {
@ -82,9 +109,9 @@ __tests__ = (
"#class" : mastodon.MastodonFollowingExtractor, "#class" : mastodon.MastodonFollowingExtractor,
"#extractor": False, "#extractor": False,
"#urls" : ( "#urls" : (
"https://mastodon.ie/@RustyBertrand",
"https://ravenation.club/@soundwarrior20", "https://ravenation.club/@soundwarrior20",
"https://mastodon.social/@0x4f", "https://mastodon.social/@0x4f",
"https://mastodon.social/@RustyBertrand",
"https://mastodon.social/@christianselig", "https://mastodon.social/@christianselig",
"https://saturation.social/@clive", "https://saturation.social/@clive",
"https://mastodon.social/@sjvn", "https://mastodon.social/@sjvn",
@ -137,4 +164,36 @@ __tests__ = (
"num" : int, "num" : int,
}, },
{
"#url" : "https://mastodon.social/@technewsbot@assortedflotsam.com/112360601113258881",
"#comment" : "card image",
"#category": ("mastodon", "mastodon.social", "status"),
"#class" : mastodon.MastodonStatusExtractor,
"#options" : {"cards": True},
"#urls" : "https://files.mastodon.social/cache/preview_cards/images/095/900/335/original/83f0b4a793c84123.jpg",
"media": {
"author_name" : "Tom Warren",
"author_url" : "https://www.theverge.com/authors/tom-warren",
"blurhash" : "UHBDWMCjVGM0k,XjnPM#0h+vkpb^RkjYSh$*",
"description" : "Microsofts big Xbox games showcase will take place on June 9th. It will include more games than last year and a special Call of Duty Direct will follow.",
"embed_url" : "",
"height" : 628,
"html" : "",
"id" : "card95900335",
"image" : "https://files.mastodon.social/cache/preview_cards/images/095/900/335/original/83f0b4a793c84123.jpg",
"image_description": "The Xbox showcase illustration",
"language" : "en",
"provider_name": "The Verge",
"provider_url": "",
"published_at": "2024-04-30T14:15:30.341Z",
"title" : "The Xbox games showcase airs June 9th, followed by a Call of Duty Direct",
"type" : "link",
"url" : "https://files.mastodon.social/cache/preview_cards/images/095/900/335/original/83f0b4a793c84123.jpg",
"weburl" : "https://www.theverge.com/2024/4/30/24145262/xbox-games-showcase-summer-2024-call-of-duty-direct",
"width" : 1200,
},
},
) )

View File

@ -21,7 +21,7 @@ __tests__ = (
"#url" : "https://misskey.design/@blooddj@pawoo.net", "#url" : "https://misskey.design/@blooddj@pawoo.net",
"#category": ("misskey", "misskey.design", "user"), "#category": ("misskey", "misskey.design", "user"),
"#class" : misskey.MisskeyUserExtractor, "#class" : misskey.MisskeyUserExtractor,
"#count" : 7, "#count" : "> 30",
}, },
{ {

View File

@ -12,7 +12,7 @@ __tests__ = (
"#url" : "https://myhentaigallery.com/g/16247", "#url" : "https://myhentaigallery.com/g/16247",
"#category": ("", "myhentaigallery", "gallery"), "#category": ("", "myhentaigallery", "gallery"),
"#class" : myhentaigallery.MyhentaigalleryGalleryExtractor, "#class" : myhentaigallery.MyhentaigalleryGalleryExtractor,
"#pattern" : r"https://images\.myhentaicomics\.com/m\w\w/images/[^/]+/original/\d+\.jpg", "#pattern" : r"https://(cdn|images)\.myhentaicomics\.com/m\w\w/images/[^/]+/original/\d+\.jpg",
"artist" : list, "artist" : list,
"count" : 11, "count" : 11,

View File

@ -24,6 +24,39 @@ __tests__ = (
"#sha1_metadata": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e", "#sha1_metadata": "a6e23d19afbee86b37d6e7ad934650c379d2cb1e",
}, },
{
"#url" : "https://blog.naver.com/PostView.nhn?blogId=rlfqjxm0&logNo=70161391809",
"#comment" : "filenames in EUC-KR encoding (#5126)",
"#category": ("", "naver", "post"),
"#class" : naver.NaverPostExtractor,
"#urls": (
"https://blogfiles.pstatic.net/20130305_23/ping9303_1362411028002Dpz9z_PNG/1_사본.png",
"https://blogfiles.pstatic.net/20130305_46/rlfqjxm0_1362473322580x33zi_PNG/오마갓합작.png",
),
"blog": {
"id" : "rlfqjxm0",
"num" : 43030507,
"user": "에나",
},
"post": {
"date" : "dt:2013-03-05 17:48:00",
"description": " ◈ PROMOTER :핑수 ˚ 아담 EDITOR핑수 넵:이크:핑수...",
"num" : 70161391809,
"title" : "[공유] { 합작} OH, MY GOD! ~ 아 또 무슨 종말을 한다 그래~",
},
"count" : 2,
"num" : range(1, 2),
"filename" : r"re:1_사본|오마갓합작",
"extension": "png",
},
{
"#url" : "https://blog.naver.com/PostView.naver?blogId=rlfqjxm0&logNo=221430673006",
"#category": ("", "naver", "post"),
"#class" : naver.NaverPostExtractor,
},
{ {
"#url" : "https://blog.naver.com/gukjung", "#url" : "https://blog.naver.com/gukjung",
"#category": ("", "naver", "blog"), "#category": ("", "naver", "blog"),
@ -42,4 +75,10 @@ __tests__ = (
"#count" : 12, "#count" : 12,
}, },
{
"#url" : "https://blog.naver.com/PostList.naver?blogId=gukjung",
"#category": ("", "naver", "blog"),
"#class" : naver.NaverBlogExtractor,
},
) )

View File

@ -109,7 +109,7 @@ __tests__ = (
"#category": ("", "naverwebtoon", "comic"), "#category": ("", "naverwebtoon", "comic"),
"#class" : naverwebtoon.NaverwebtoonComicExtractor, "#class" : naverwebtoon.NaverwebtoonComicExtractor,
"#pattern" : naverwebtoon.NaverwebtoonEpisodeExtractor.pattern, "#pattern" : naverwebtoon.NaverwebtoonEpisodeExtractor.pattern,
"#count" : 25, "#count" : 24,
}, },
{ {

View File

@ -15,11 +15,11 @@ __tests__ = (
}, },
{ {
"#url" : "https://www.omgmiamiswimwear.com/products/la-medusa-maxi-dress", "#url" : "https://www.omgmiamiswimwear.com/products/snatch-me-waist-belt",
"#category": ("shopify", "omgmiamiswimwear", "product"), "#category": ("shopify", "omgmiamiswimwear", "product"),
"#class" : shopify.ShopifyProductExtractor, "#class" : shopify.ShopifyProductExtractor,
"#pattern" : r"https://cdn\.shopify\.com/s/files/1/1819/6171/", "#pattern" : r"https://cdn\.shopify\.com/s/files/1/1819/6171/",
"#count" : 5, "#count" : 3,
}, },
) )

View File

@ -163,6 +163,14 @@ __tests__ = (
"#count" : ">= 10", "#count" : ">= 10",
}, },
{
"#url" : "https://www.pixiv.net/artworks/966412",
"#comment" : "limit_sanity_level_360.png (#4327, #5180)",
"#category": ("", "pixiv", "work"),
"#class" : pixiv.PixivWorkExtractor,
"#count" : 0,
},
{ {
"#url" : "https://www.pixiv.net/en/artworks/966412", "#url" : "https://www.pixiv.net/en/artworks/966412",
"#category": ("", "pixiv", "work"), "#category": ("", "pixiv", "work"),
@ -459,11 +467,14 @@ __tests__ = (
{ {
"#url" : "https://www.pixiv.net/novel/show.php?id=16422450", "#url" : "https://www.pixiv.net/novel/show.php?id=16422450",
"#comment" : "embeds", "#comment" : "embeds // covers (#5373)",
"#category": ("", "pixiv", "novel"), "#category": ("", "pixiv", "novel"),
"#class" : pixiv.PixivNovelExtractor, "#class" : pixiv.PixivNovelExtractor,
"#options" : {"embeds": True}, "#options" : {
"#count" : 3, "embeds": True,
"covers": True,
},
"#count" : 4,
}, },
{ {

View File

@ -62,9 +62,11 @@ __tests__ = (
"hardcore sex", "hardcore sex",
"babes 18 year", "babes 18 year",
], ],
"timestamp": "5:07",
"title" : "Intense sloppy blowjob of Danika Mori", "title" : "Intense sloppy blowjob of Danika Mori",
"url" : "https://el.phncdn.com/pics/gifs/043/726/891/43726891a.webm", "url" : "https://el.phncdn.com/pics/gifs/043/726/891/43726891a.webm",
"user" : "Danika Mori", "user" : "Danika Mori",
"viewkey" : "64367c8c78a4a",
}, },
{ {

Some files were not shown because too many files have changed in this diff.