diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml index 1abff80c..043940b6 100644 --- a/.github/workflows/docker.yml +++ b/.github/workflows/docker.yml @@ -1,33 +1,47 @@ -name: docker +name: Docker Images on: workflow_dispatch: push: + branches: + - master tags: - v[0-9]+.[0-9]+.[0-9]+ + permissions: packages: write +concurrency: + group: docker + cancel-in-progress: false + jobs: - docker: + build: runs-on: ubuntu-latest + # on release commits, run only for tag event + if: ${{ ! startsWith( github.event.head_commit.message , 'release version ' ) || startsWith( github.ref , 'refs/tags/v' ) }} + steps: - uses: actions/checkout@v4 - # https://github.com/docker/setup-buildx-action - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - - # https://github.com/docker/login-action - - name: Login to GitHub Container Registry - uses: docker/login-action@v3 + - uses: docker/metadata-action@v5 + id: metadata with: - registry: ghcr.io - username: ${{ github.repository_owner }} - password: ${{ secrets.GHCR_TOKEN }} + images: | + mikf123/gallery-dl + ghcr.io/mikf/gallery-dl + tags: | + type=ref,event=tag + type=raw,value=dev + type=sha,format=long,prefix= + type=raw,priority=500,value={{date 'YYYY.MM.DD'}} + + - uses: docker/setup-qemu-action@v3 + + - uses: docker/setup-buildx-action@v3 - name: Login to DockerHub uses: docker/login-action@v3 @@ -35,23 +49,17 @@ jobs: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - # https://github.com/docker/metadata-action - - name: Generate Docker tags - uses: docker/metadata-action@v5 - id: metadata + - name: Login to GitHub Container Registry + uses: docker/login-action@v3 with: - images: | - mikf123/gallery-dl - ghcr.io/mikf/gallery-dl - tags: | - type=sha,format=long,prefix= - type=ref,event=tag + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_TOKEN }} - # https://github.com/docker/build-push-action - - name: Build image - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v5 with: + context: . 
push: true tags: ${{ steps.metadata.outputs.tags }} labels: ${{ steps.metadata.outputs.labels }} - platforms: linux/amd64 + platforms: linux/amd64,linux/arm64 diff --git a/.github/workflows/executables.yml b/.github/workflows/executables.yml index 9d49e875..7a303ce2 100644 --- a/.github/workflows/executables.yml +++ b/.github/workflows/executables.yml @@ -1,10 +1,15 @@ -name: executables +name: Executables on: workflow_dispatch: push: branches: - master + tags-ignore: + - "*" + +env: + DATE_FORMAT: "%Y.%m.%d" jobs: build: @@ -31,19 +36,58 @@ jobs: - uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} ${{ matrix.architecture }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} architecture: ${{ matrix.architecture }} + - name: Date + run: echo "DATE=$(date '+${{ env.DATE_FORMAT }}')" >> "$GITHUB_ENV" + + - name: Update Version + # use Python since its behavior is consistent across operating systems + shell: python + run: | + import re + path = "./gallery_dl/version.py" + with open(path) as fp: + content = fp.read() + content = re.sub( + r'\b(__version__ = "[^"]+)', + r"\1:${{ env.DATE }}", + content) + with open(path, "w") as fp: + fp.write(content) + - name: Build executable run: | pip install requests requests[socks] yt-dlp pyyaml ${{ matrix.python-packages }} pyinstaller - python scripts/pyinstaller.py + python ./scripts/pyinstaller.py --os '${{ matrix.os }}' --arch '${{ matrix.architecture }}' - - name: Upload executable - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: - name: gallery-dl-${{ matrix.os }}-${{ matrix.architecture }}-${{ matrix.python-version }} - path: | - dist + name: executable-${{ matrix.os }}-${{ matrix.architecture }}-${{ matrix.python-version }} + path: dist/* + retention-days: 1 + compression-level: 0 + + release: + + needs: build + runs-on: ubuntu-latest + + steps: + - uses: actions/download-artifact@v4 + + - name: Date + run: echo "DATE=$(date '+${{ env.DATE_FORMAT }}')" >> "$GITHUB_ENV" + + - uses: ncipollo/release-action@v1 + with: + owner: gdl-org + repo: builds + tag: ${{ env.DATE }} + artifacts: "executable-*/*" + allowUpdates: true + makeLatest: true + token: ${{ secrets.REPO_TOKEN }} diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml new file mode 100644 index 00000000..e0335a55 --- /dev/null +++ b/.github/workflows/pages.yml @@ -0,0 +1,56 @@ +name: GitHub Pages + +on: + workflow_dispatch: + push: + branches: + - master + paths: + - docs/** + +permissions: + contents: read + pages: write + id-token: write + +concurrency: + group: pages + cancel-in-progress: false + +jobs: + dispatch: + + runs-on: ubuntu-latest + + steps: + - name: Dispatch to gdl-org/docs + run: > + curl -L + -X POST + -H "Accept: application/vnd.github+json" + -H "Authorization: Bearer ${{ secrets.REPO_TOKEN }}" + -H "X-GitHub-Api-Version: 2022-11-28" + https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches + -d '{"ref":"master"}' + + deploy: + + runs-on: ubuntu-latest + + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + + steps: + - uses: actions/checkout@v4 + - uses: actions/configure-pages@v4 + + - name: Copy static files + run: | + mkdir --parents -- ./_site + cp --archive --target-directory=./_site -- \ + ./docs/oauth-redirect.html + + - uses: actions/upload-pages-artifact@v3 + - uses: actions/deploy-pages@v4 + id: deployment diff --git a/.github/workflows/pages_dispatch.yml 
b/.github/workflows/pages_dispatch.yml deleted file mode 100644 index 835c1400..00000000 --- a/.github/workflows/pages_dispatch.yml +++ /dev/null @@ -1,23 +0,0 @@ -name: Dispatch GitHub Pages Build - -on: - workflow_dispatch: - push: - branches: - - "master" - paths: - - "docs/**" - -jobs: - dispatch: - runs-on: ubuntu-latest - steps: - - name: dispatch - run: > - curl -L - -X POST - -H "Accept: application/vnd.github+json" - -H "Authorization: Bearer ${{ secrets.DISPATCH_TOKEN }}" - -H "X-GitHub-Api-Version: 2022-11-28" - https://api.github.com/repos/gdl-org/docs/actions/workflows/pages.yml/dispatches - -d '{"ref":"master"}' diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 18abb567..6c031739 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.5", "3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"] + python-version: ["3.6", "3.7", "3.8", "3.9", "3.10", "3.11", "3.12", "pypy3.9"] steps: - uses: actions/checkout@v4 @@ -26,7 +26,7 @@ jobs: if [[ "$(find ./gallery_dl -type f -not -perm 644)" ]]; then exit 1; fi - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} diff --git a/CHANGELOG.md b/CHANGELOG.md index f938ab94..8cdcf642 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,89 @@ # Changelog +## 1.26.9 - 2024-03-23 +### Extractors +#### Additions +- [artstation] support video clips ([#2566](https://github.com/mikf/gallery-dl/issues/2566), [#3309](https://github.com/mikf/gallery-dl/issues/3309), [#3911](https://github.com/mikf/gallery-dl/issues/3911)) +- [artstation] support collections ([#146](https://github.com/mikf/gallery-dl/issues/146)) +- [deviantart] recognize `deviantart.com/stash/…` URLs +- [idolcomplex] support new pool URLs +- [lensdump] recognize direct image links ([#5293](https://github.com/mikf/gallery-dl/issues/5293)) +- [skeb] add extractor for followed users ([#5290](https://github.com/mikf/gallery-dl/issues/5290)) +- [twitter] add `quotes` extractor ([#5262](https://github.com/mikf/gallery-dl/issues/5262)) +- [wikimedia] support `azurlane.koumakan.jp` ([#5256](https://github.com/mikf/gallery-dl/issues/5256)) +- [xvideos] support `/channels/` URLs ([#5244](https://github.com/mikf/gallery-dl/issues/5244)) +#### Fixes +- [artstation] fix handling usernames with dashes in domain names ([#5224](https://github.com/mikf/gallery-dl/issues/5224)) +- [bluesky] fix not spawning child extractors for followed users ([#5246](https://github.com/mikf/gallery-dl/issues/5246)) +- [deviantart] handle CloudFront blocks ([#5363](https://github.com/mikf/gallery-dl/issues/5363)) +- [deviantart:avatar] fix `index` for URLs without `?` ([#5276](https://github.com/mikf/gallery-dl/issues/5276)) +- [deviantart:stash] fix `index` values ([#5335](https://github.com/mikf/gallery-dl/issues/5335)) +- [gofile] fix extraction +- [hiperdex] update URL patterns & fix `manga` metadata ([#5340](https://github.com/mikf/gallery-dl/issues/5340)) +- [idolcomplex] fix metadata extraction +- [imagefap] fix folder extraction ([#5333](https://github.com/mikf/gallery-dl/issues/5333)) +- [instagram] make accessing `like_count` non-fatal ([#5218](https://github.com/mikf/gallery-dl/issues/5218)) +- [mastodon] fix handling null `moved` account field ([#5321](https://github.com/mikf/gallery-dl/issues/5321)) +- [naver] fix EUC-KR encoding issue in old image URLs 
([#5126](https://github.com/mikf/gallery-dl/issues/5126)) +- [nijie] increase default delay between requests ([#5221](https://github.com/mikf/gallery-dl/issues/5221)) +- [nitter] ignore invalid Tweets ([#5253](https://github.com/mikf/gallery-dl/issues/5253)) +- [pixiv:novel] fix text extraction ([#5285](https://github.com/mikf/gallery-dl/issues/5285), [#5309](https://github.com/mikf/gallery-dl/issues/5309)) +- [skeb] retry 429 responses containing a `request_key` cookie ([#5210](https://github.com/mikf/gallery-dl/issues/5210)) +- [warosu] fix crash for threads with deleted posts ([#5289](https://github.com/mikf/gallery-dl/issues/5289)) +- [weibo] fix retweets ([#2825](https://github.com/mikf/gallery-dl/issues/2825), [#3874](https://github.com/mikf/gallery-dl/issues/3874), [#5263](https://github.com/mikf/gallery-dl/issues/5263)) +- [weibo] fix `livephoto` filename extensions ([#5287](https://github.com/mikf/gallery-dl/issues/5287)) +- [xvideos] fix galleries with more than 500 images ([#5244](https://github.com/mikf/gallery-dl/issues/5244)) +#### Improvements +- [bluesky] improve API error messages +- [bluesky] handle posts with different `embed` structure +- [deviantart:avatar] ignore default avatars ([#5276](https://github.com/mikf/gallery-dl/issues/5276)) +- [fapello] download full-sized images ([#5349](https://github.com/mikf/gallery-dl/issues/5349)) +- [gelbooru:favorite] automatically detect returned post order ([#5220](https://github.com/mikf/gallery-dl/issues/5220)) +- [imgur] fail downloads when redirected to `removed.png` ([#5308](https://github.com/mikf/gallery-dl/issues/5308)) +- [instagram] raise proper error for missing `reels_media` ([#5257](https://github.com/mikf/gallery-dl/issues/5257)) +- [instagram] change `posts are private` exception to a warning ([#5322](https://github.com/mikf/gallery-dl/issues/5322)) +- [reddit] improve preview fallback formats ([#5296](https://github.com/mikf/gallery-dl/issues/5296), [#5315](https://github.com/mikf/gallery-dl/issues/5315)) +- [steamgriddb] raise exception for deleted assets +- [twitter] handle "account is temporarily locked" errors ([#5300](https://github.com/mikf/gallery-dl/issues/5300)) +- [weibo] rework pagination logic ([#4168](https://github.com/mikf/gallery-dl/issues/4168)) +- [zerochan] fetch more posts by using the API ([#3669](https://github.com/mikf/gallery-dl/issues/3669)) +#### Metadata +- [bluesky] add `instance` metadata field ([#4438](https://github.com/mikf/gallery-dl/issues/4438)) +- [gelbooru:favorite] add `date_favorited` metadata field +- [imagefap] extract `folder` metadata ([#5270](https://github.com/mikf/gallery-dl/issues/5270)) +- [instagram] default `likes` to `0` ([#5323](https://github.com/mikf/gallery-dl/issues/5323)) +- [kemonoparty] add `revision_count` metadata field ([#5334](https://github.com/mikf/gallery-dl/issues/5334)) +- [naver] unescape post `title` and `description` +- [pornhub:gif] extract `viewkey` and `timestamp` metadata ([#4463](https://github.com/mikf/gallery-dl/issues/4463)) +- [redgifs] make `date` available for directories ([#5262](https://github.com/mikf/gallery-dl/issues/5262)) +- [subscribestar] fix `date` metadata +- [twitter] add `birdwatch` metadata field ([#5317](https://github.com/mikf/gallery-dl/issues/5317)) +- [twitter] add `protected` metadata field ([#5327](https://github.com/mikf/gallery-dl/issues/5327)) +- [warosu] fix `board_name` metadata +#### Options +- [bluesky] add `reposts` option ([#4438](https://github.com/mikf/gallery-dl/issues/4438), 
[#5248](https://github.com/mikf/gallery-dl/issues/5248)) +- [deviantart] add `comments-avatars` option ([#4995](https://github.com/mikf/gallery-dl/issues/4995)) +- [deviantart] extend `metadata` option ([#5175](https://github.com/mikf/gallery-dl/issues/5175)) +- [flickr] add `contexts` option ([#5324](https://github.com/mikf/gallery-dl/issues/5324)) +- [gelbooru:favorite] add `order-posts` option ([#5220](https://github.com/mikf/gallery-dl/issues/5220)) +- [kemonoparty] add `order-revisions` option ([#5334](https://github.com/mikf/gallery-dl/issues/5334)) +- [vipergirls] add `like` option ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +- [vipergirls] add `domain` option ([#4166](https://github.com/mikf/gallery-dl/issues/4166)) +### Downloaders +- [http] add MIME type and signature for `.mov` files ([#5287](https://github.com/mikf/gallery-dl/issues/5287)) +### Docker +- build images from source instead of PyPI package +- build `linux/arm64` images ([#5227](https://github.com/mikf/gallery-dl/issues/5227)) +- build images on every push to master + - tag images as `YYYY.MM.DD` + - tag the most recent build from master as `dev` + - tag the most recent release build as `latest` +- reduce image size ([#5097](https://github.com/mikf/gallery-dl/issues/5097)) +### Miscellaneous +- [formatter] fix local DST datetime offsets for `:O` +- build Linux executable on Ubuntu 22.04 LTS ([#4184](https://github.com/mikf/gallery-dl/issues/4184)) +- automatically create directories for logging files ([#5249](https://github.com/mikf/gallery-dl/issues/5249)) + ## 1.26.8 - 2024-02-17 ### Extractors #### Additions diff --git a/Dockerfile b/Dockerfile index 77e97cd9..30759122 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,21 @@ FROM python:alpine -RUN python3 -m pip install --no-cache-dir -U pip && \ - python3 -m pip install --no-cache-dir -U gallery-dl yt-dlp -RUN apk update && \ - apk add --no-cache ffmpeg && \ - rm -rf /var/cache/apk/* +ENV LANG=C.UTF-8 + +RUN : \ + && apk --no-interactive update \ + && apk --no-cache --no-interactive add ffmpeg \ + && rm -rf /var/cache/apk \ + && : + +RUN : \ + && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \ + pip \ + && python3 -B -m pip --no-cache-dir --no-input --disable-pip-version-check install -U \ + https://github.com/mikf/gallery-dl/archive/refs/heads/master.tar.gz \ + yt-dlp \ + && rm -rf /root/.cache/pip \ + && find /usr/local/lib/python3.*/site-packages/setuptools -name __pycache__ -exec rm -rf {} + \ + && find /usr/local/lib/python3.*/site-packages/wheel -name __pycache__ -exec rm -rf {} + \ + && : + ENTRYPOINT [ "gallery-dl" ] diff --git a/README.rst b/README.rst index 6f6aa025..366db3cb 100644 --- a/README.rst +++ b/README.rst @@ -7,8 +7,8 @@ to download image galleries and collections from several image hosting sites (see `Supported Sites `__). It is a cross-platform tool -with many `configuration options `__ -and powerful `filenaming capabilities `__. +with many `configuration options `__ +and powerful `filenaming capabilities `__. |pypi| |build| @@ -72,9 +72,9 @@ Standalone Executable Prebuilt executable files with a Python interpreter and required Python packages included are available for -- `Windows `__ +- `Windows `__ (Requires `Microsoft Visual C++ Redistributable Package (x86) `__) -- `Linux `__ +- `Linux `__ Nightly Builds @@ -234,7 +234,7 @@ Documentation ------------- A list of all available configuration options and their descriptions -can be found in ``__. +can be found at ``__. 
| For a default configuration file with available options set to their default values, see ``__. @@ -330,7 +330,7 @@ CAPTCHA or similar, or has not been implemented yet, you can use the cookies from a browser login session and input them into *gallery-dl*. This can be done via the -`cookies `__ +`cookies `__ option in your configuration file by specifying - | the path to a Mozilla/Netscape format cookies.txt file exported by a browser addon diff --git a/docs/_layouts/default.html b/docs/_layouts/default.html new file mode 100644 index 00000000..8658aefa --- /dev/null +++ b/docs/_layouts/default.html @@ -0,0 +1,20 @@ + + + + + + + +{% seo %} + + + + + +
+ + {{ content }} + +
+ + diff --git a/docs/configuration.rst b/docs/configuration.rst index 37f12f13..4eebca2c 100644 --- a/docs/configuration.rst +++ b/docs/configuration.rst @@ -337,6 +337,15 @@ Description filename extension (``file.1.ext``, ``file.2.ext``, etc.) +extractor.*.skip-filter +----------------------- +Type + ``string`` +Description + Python expression controlling which skipped files to count towards + ``"abort"`` / ``"terminate"`` / ``"exit"``. + + extractor.*.sleep ----------------- Type @@ -358,12 +367,39 @@ Description i.e. before starting a new extractor. +extractor.*.sleep-429 +--------------------- +Type + |Duration|_ +Default + ``60`` +Description + Number of seconds to sleep when receiving a `429 Too Many Requests` + response before `retrying `__ the request. + + extractor.*.sleep-request ------------------------- Type |Duration|_ Default - ``0`` + * ``"0.5-1.5"`` + ``[Danbooru]``, ``[E621]``, ``[foolfuuka]:search``, ``itaku``, + ``newgrounds``, ``[philomena]``, ``pixiv:novel``, ``plurk``, + ``poipiku`` , ``pornpics``, ``soundgasm``, ``urlgalleries``, + ``vk``, ``zerochan`` + * ``"1.0-2.0"`` + ``flickr``, ``weibo``, ``[wikimedia]`` + * ``"2.0-4.0"`` + ``behance``, ``imagefap``, ``[Nijie]`` + * ``"3.0-6.0"`` + ``exhentai``, ``idolcomplex``, ``[reactor]``, ``readcomiconline`` + * ``"6.0-6.1"`` + ``twibooru`` + * ``"6.0-12.0"`` + ``instagram`` + * ``0`` + otherwise Description Minimal time interval in seconds between each HTTP request during data extraction. @@ -382,6 +418,7 @@ Description Specifying username and password is required for * ``nijie`` + * ``horne`` and optional for @@ -389,8 +426,12 @@ Description * ``aryion`` * ``atfbooru`` (*) * ``bluesky`` + * ``booruvar`` (*) + * ``coomerparty`` * ``danbooru`` (*) + * ``deviantart`` * ``e621`` (*) + * ``e6ai`` (*) * ``e926`` (*) * ``exhentai`` * ``idolcomplex`` @@ -401,7 +442,6 @@ Description * ``mangoxo`` * ``pillowfort`` * ``sankaku`` - * ``seisoparty`` * ``subscribestar`` * ``tapas`` * ``tsumino`` @@ -417,7 +457,7 @@ Description the API key found in your user profile, not the actual account password. Note: Leave the ``password`` value empty or undefined - to get prompted for a passeword when performing a login + to be prompted for a passeword when performing a login (see `getpass() `__). @@ -557,8 +597,8 @@ extractor.*.browser Type ``string`` Default - * ``"firefox"`` for ``patreon``, ``mangapark``, and ``mangasee`` - * ``null`` everywhere else + * ``"firefox"``: ``artstation``, ``mangasee``, ``patreon``, ``pixiv:series``, ``twitter`` + * ``null``: otherwise Example * ``"chrome:macos"`` Description @@ -633,8 +673,8 @@ extractor.*.tls12 Type ``bool`` Default - * ``true`` - * ``false`` for ``patreon``, ``pixiv:series`` + * ``false``: ``patreon``, ``pixiv:series`` + * ``true``: otherwise Description Allow selecting TLS 1.2 cipher suites. @@ -813,6 +853,22 @@ Description An alternative `format string`_ to build archive IDs with. +extractor.*.archive-mode +------------------------ +Type + ``string`` +Default + ``"file"`` +Description + Controls when to write `archive IDs `__ + to the archive database. + + * ``"file"``: Write IDs immediately + after completing or skipping a file download. + * ``"memory"``: Keep IDs in memory + and only write them after successful job completion. + + extractor.*.archive-prefix -------------------------- Type @@ -836,6 +892,65 @@ Description for available ``PRAGMA`` statements and further details. 
+extractor.*.actions +------------------- +Type + * ``object`` (`pattern` -> `action`) + * ``list`` of ``lists`` with 2 ``strings`` as elements +Example + .. code:: json + + { + "error" : "status |= 1", + "warning:(?i)unable to .+": "exit 127", + "info:Logging in as .+" : "level = debug" + } + + .. code:: json + + [ + ["error" , "status |= 1" ], + ["warning:(?i)unable to .+", "exit 127" ], + ["info:Logging in as .+" , "level = debug"] + ] + +Description + Perform an ``action`` when logging a message matched by ``pattern``. + + ``pattern`` is parsed as severity level (``debug``, ``info``, ``warning``, ``error``, or integer value) + followed by an optional `Python Regular Expression `__ + separated by a colon ``:``. + Using ``*`` as `level` or leaving it empty + matches logging messages of all levels + (e.g. ``*:`` or ``:``). + + ``action`` is parsed as action type + followed by (optional) arguments. + + Supported Action Types: + + ``status``: + | Modify job exit status. + | Expected syntax is `` `` (e.g. ``= 100``). + + Supported operators are + ``=`` (assignment), + ``&`` (bitwise AND), + ``|`` (bitwise OR), + ``^`` (bitwise XOR). + ``level``: + | Modify severity level of the current logging message. + | Can be one of ``debug``, ``info``, ``warning``, ``error`` or an integer value. + ``print`` + Write argument to stdout. + ``restart``: + Restart the current extractor run. + ``wait``: + Stop execution until Enter is pressed. + ``exit``: + Exit the program with the given argument as exit status. + + extractor.*.postprocessors -------------------------- Type @@ -1872,6 +1987,20 @@ Description from `linking your Flickr account to gallery-dl `__. +extractor.flickr.contexts +------------------------- +Type + ``bool`` +Default + ``false`` +Description + For each photo, return the albums and pools it belongs to + as ``set`` and ``pool`` metadata. + + Note: This requires 1 additional API call per photo. + See `flickr.photos.getAllContexts `__ for details. + + extractor.flickr.exif --------------------- Type @@ -1879,9 +2008,11 @@ Type Default ``false`` Description - Fetch `exif` and `camera` metadata for each photo. + For each photo, return its EXIF/TIFF/GPS tags + as ``exif`` and ``camera`` metadata. Note: This requires 1 additional API call per photo. + See `flickr.photos.getExif `__ for details. extractor.flickr.metadata @@ -1901,7 +2032,7 @@ Description It is possible to specify a custom list of metadata includes. See `the extras parameter `__ - in `Flickr API docs `__ + in `Flickr's API docs `__ for possible field names. @@ -2001,6 +2132,20 @@ Description page. +extractor.gelbooru.favorite.order-posts +--------------------------------------- +Type + ``string`` +Default + ``"desc"`` +Description + Controls the order in which favorited posts are returned. + + * ``"asc"``: Ascending favorite date order (oldest first) + * ``"desc"``: Descending favorite date order (newest first) + * ``"reverse"``: Same as ``"asc"`` + + extractor.generic.enabled ------------------------- Type @@ -2287,6 +2432,16 @@ Description Extract a user's direct messages as ``dms`` metadata. +extractor.kemonoparty.announcements +----------------------------------- +Type + ``bool`` +Default + ``false`` +Description + Extract a user's announcements as ``announcements`` metadata. + + extractor.kemonoparty.favorites ------------------------------- Type @@ -2346,6 +2501,22 @@ Description Note: This requires 1 additional HTTP request per post. 
+extractor.kemonoparty.order-revisions +------------------------------------- +Type + ``string`` +Default + ``"desc"`` +Description + Controls the order in which + `revisions `__ + are returned. + + * ``"asc"``: Ascending order (oldest first) + * ``"desc"``: Descending order (newest first) + * ``"reverse"``: Same as ``"asc"`` + + extractor.khinsider.format -------------------------- Type @@ -2470,6 +2641,16 @@ Description user IDs. +extractor.[mastodon].cards +-------------------------- +Type + ``bool`` +Default + ``false`` +Description + Fetch media from cards. + + extractor.[mastodon].reblogs ---------------------------- Type @@ -2829,14 +3010,24 @@ Description `gppt `__. -extractor.pixiv.embeds ----------------------- +extractor.pixiv.novel.covers +---------------------------- Type ``bool`` Default ``false`` Description - Download images embedded in novels. + Download cover images. + + +extractor.pixiv.novel.embeds +---------------------------- +Type + ``bool`` +Default + ``false`` +Description + Download embedded images. extractor.pixiv.novel.full-series @@ -3286,7 +3477,7 @@ Examples * ``["jpeg", "webp"]`` Description Only include assets that are in the specified file types. ``all`` can be - used to specifiy all file types. Valid values are: + used to specify all file types. Valid values are: * Grids: ``png``, ``jpeg``, ``jpg``, ``webp`` * Heroes: ``png``, ``jpeg``, ``jpg``, ``webp`` @@ -3326,7 +3517,7 @@ Examples * ``["fr", "it"]`` Description Only include assets that are in the specified languages. ``all`` can be - used to specifiy all languages. Valid values are `ISO 639-1 `__ + used to specify all languages. Valid values are `ISO 639-1 `__ language codes. @@ -3771,6 +3962,32 @@ Description * ``"wait"``: Wait until rate limit reset +extractor.twitter.relogin +------------------------- +Type + ``bool`` +Default + ``true`` +Description + | When receiving a "Could not authenticate you" error while logged in with + `username & passeword `__, + | refresh the current login session and + try to continue from where it left off. + + +extractor.twitter.locked +------------------------ +Type + ``string`` +Default + ``"abort"`` +Description + Selects how to handle "account is temporarily locked" errors. + + * ``"abort"``: Raise an error and stop extraction + * ``"wait"``: Wait until the account is unlocked and retry + + extractor.twitter.replies ------------------------- Type @@ -3909,6 +4126,31 @@ Description ``"raw"``, ``"full"``, ``"regular"``, ``"small"``, and ``"thumb"``. +extractor.vipergirls.domain +--------------------------- +Type + ``string`` +Default + ``"vipergirls.to"`` +Description + Specifies the domain used by ``vipergirls`` extractors. + + For example ``"viper.click"`` if the main domain is blocked or to bypass Cloudflare, + + +extractor.vipergirls.like +------------------------- +Type + ``bool`` +Default + ``false`` +Description + Automatically `like` posts after downloading their images. + + Note: Requires `login `__ + or `cookies `__ + + extractor.vsco.videos --------------------- Type @@ -4039,7 +4281,7 @@ extractor.weibo.retweets Type ``bool`` Default - ``true`` + ``false`` Description Fetch media from retweeted posts. @@ -4714,10 +4956,33 @@ output.colors Type ``object`` (`key` -> `ANSI color`) Default - ``{"success": "1;32", "skip": "2"}`` + .. 
code:: json + + { + "success": "1;32", + "skip" : "2", + "debug" : "0;37", + "info" : "1;37", + "warning": "1;33", + "error" : "1;31" + } + Description - Controls the `ANSI colors `__ - used with |mode: color|__ for successfully downloaded or skipped files. + Controls the + `ANSI colors `__ + used for various outputs. + + Output for |mode: color|__ + + * ``success``: successfully downloaded files + * ``skip``: skipped files + + Logging Messages: + + * ``debug``: debug logging messages + * ``info``: info logging messages + * ``warning``: warning logging messages + * ``error``: error logging messages .. __: `output.mode`_ @@ -4727,7 +4992,7 @@ output.ansi Type ``bool`` Default - ``false`` + ``true`` Description | On Windows, enable ANSI escape sequences and colored output | by setting the ``ENABLE_VIRTUAL_TERMINAL_PROCESSING`` flag for stdout and stderr. @@ -5784,7 +6049,7 @@ How To * choose a name * select "installed app" * set ``http://localhost:6414/`` as "redirect uri" - * solve the "I'm not a rebot" reCATCHA if needed + * solve the "I'm not a robot" reCAPTCHA if needed * click "create app" * copy the client id (third line, under your application's name and @@ -5932,7 +6197,7 @@ Description * format * General format string for logging messages - or a dictionary with format strings for each loglevel. + or an ``object`` with format strings for each loglevel. In addition to the default `LogRecord attributes `__, diff --git a/docs/links.js b/docs/links.js new file mode 100644 index 00000000..487907b9 --- /dev/null +++ b/docs/links.js @@ -0,0 +1,44 @@ +"use strict"; + + +function add_header_links() +{ + let style = document.createElement("style"); + style.id = "headerlinks" + document.head.appendChild(style); + style.sheet.insertRule( + "a.headerlink {" + + " visibility: hidden;" + + " text-decoration: none;" + + " font-size: 0.8em;" + + " padding: 0 4px 0 4px;" + + "}"); + style.sheet.insertRule( + ":hover > a.headerlink {" + + " visibility: visible;" + + "}"); + + let headers = document.querySelectorAll("h2, h3, h4, h5, h6"); + for (let i = 0, len = headers.length; i < len; ++i) + { + let header = headers[i]; + + let id = header.id || header.parentNode.id; + if (!id) + continue; + + let link = document.createElement("a"); + link.href = "#" + id; + link.className = "headerlink"; + link.textContent = "¶"; + + header.appendChild(link); + } +} + + +if (document.readyState !== "loading") { + add_header_links(); +} else { + document.addEventListener("DOMContentLoaded", add_header_links); +} diff --git a/docs/options.md b/docs/options.md index 45ce7eca..5b2de40b 100644 --- a/docs/options.md +++ b/docs/options.md @@ -29,6 +29,7 @@ ## Output Options: -q, --quiet Activate quiet mode + -w, --warning Print only warnings and errors -v, --verbose Print various debugging information -g, --get-urls Print URLs instead of downloading -G, --resolve-urls Print URLs instead of downloading; resolve @@ -48,12 +49,12 @@ extractors but cannot be handled, to FILE --write-pages Write downloaded intermediary pages to files in the current directory to debug problems + --no-colors Do not emit ANSI color codes in output ## Downloader Options: -r, --limit-rate RATE Maximum download rate (e.g. 500k or 2.5M) -R, --retries N Maximum number of retries for failed HTTP - requests or -1 for infinite retries (default: - 4) + requests or -1 for infinite retries (default: 4) --http-timeout SECONDS Timeout for HTTP connections (default: 30.0) --sleep SECONDS Number of seconds to wait before each download. 
This can be either a constant value or a range diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 312cdc23..034c8c6e 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -790,7 +790,7 @@ Consider all listed sites to potentially be NSFW. Skeb https://skeb.jp/ - Followed Users, Posts, Search Results, User Profiles + Followed Creators, Followed Users, Posts, Search Results, User Profiles @@ -838,7 +838,7 @@ Consider all listed sites to potentially be NSFW. Tapas https://tapas.io/ - Episodes, Series + Creators, Episodes, Series Supported @@ -898,7 +898,7 @@ Consider all listed sites to potentially be NSFW. Twitter https://twitter.com/ - Avatars, Backgrounds, Bookmarks, Communities, Events, Followed Users, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Search Results, Timelines, Tweets, User Profiles + Avatars, Backgrounds, Bookmarks, Communities, Events, Followed Users, Hashtags, individual Images, Likes, Lists, List Members, Media Timelines, Quotes, Search Results, Timelines, Tweets, User Profiles Supported @@ -940,14 +940,14 @@ Consider all listed sites to potentially be NSFW. VSCO https://vsco.co/ - Collections, individual Images, Spaces, User Profiles + Avatars, Collections, individual Images, Spaces, User Profiles Wallhaven https://wallhaven.cc/ Collections, individual Images, Search Results, User Profiles - API Key + API Key Wallpaper Cave @@ -965,7 +965,7 @@ Consider all listed sites to potentially be NSFW. Weasyl https://www.weasyl.com/ Favorites, Folders, Journals, Submissions - API Key + API Key webmshare @@ -1103,7 +1103,7 @@ Consider all listed sites to potentially be NSFW. Booruvar https://booru.borvar.art/ Pools, Popular Images, Posts, Tag Searches - + Supported @@ -1125,7 +1125,7 @@ Consider all listed sites to potentially be NSFW. e6AI https://e6ai.net/ Favorites, Pools, Popular Images, Posts, Tag Searches - + Supported @@ -1319,7 +1319,7 @@ Consider all listed sites to potentially be NSFW. Derpibooru https://derpibooru.org/ Galleries, Posts, Search Results - API Key + API Key Ponybooru @@ -1331,7 +1331,7 @@ Consider all listed sites to potentially be NSFW. Furbooru https://furbooru.org/ Galleries, Posts, Search Results - + API Key @@ -1499,6 +1499,12 @@ Consider all listed sites to potentially be NSFW. Articles + + wiki.gg + https://www.wiki.gg/ + Articles + + Super Mario Wiki https://www.mariowiki.com/ @@ -1616,19 +1622,19 @@ Consider all listed sites to potentially be NSFW. 
mastodon.social https://mastodon.social/ - Bookmarks, Followed Users, Images from Statuses, User Profiles + Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles OAuth Pawoo https://pawoo.net/ - Bookmarks, Followed Users, Images from Statuses, User Profiles + Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles OAuth baraag https://baraag.net/ - Bookmarks, Followed Users, Images from Statuses, User Profiles + Bookmarks, Favorites, Followed Users, Hashtags, Lists, Images from Statuses, User Profiles OAuth diff --git a/gallery_dl/__init__.py b/gallery_dl/__init__.py index 19ea77b2..7ca405aa 100644 --- a/gallery_dl/__init__.py +++ b/gallery_dl/__init__.py @@ -38,6 +38,11 @@ def main(): except ImportError: import toml config.load(args.configs_toml, strict=True, loads=toml.loads) + if not args.colors: + output.ANSI = False + config.set((), "colors", False) + if util.WINDOWS: + config.set(("output",), "ansi", False) if args.filename: filename = args.filename if filename == "/O": @@ -86,7 +91,7 @@ def main(): signal.signal(signal_num, signal.SIG_IGN) # enable ANSI escape sequences on Windows - if util.WINDOWS and config.get(("output",), "ansi"): + if util.WINDOWS and config.get(("output",), "ansi", output.COLORS): from ctypes import windll, wintypes, byref kernel32 = windll.kernel32 mode = wintypes.DWORD() @@ -113,7 +118,7 @@ def main(): # loglevels output.configure_logging(args.loglevel) - if args.loglevel >= logging.ERROR: + if args.loglevel >= logging.WARNING: config.set(("output",), "mode", "null") config.set(("downloader",), "progress", None) elif args.loglevel <= logging.DEBUG: diff --git a/gallery_dl/archive.py b/gallery_dl/archive.py new file mode 100644 index 00000000..5f05bbfd --- /dev/null +++ b/gallery_dl/archive.py @@ -0,0 +1,98 @@ +# -*- coding: utf-8 -*- + +# Copyright 2024 Mike Fährmann +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License version 2 as +# published by the Free Software Foundation. + +"""Download Archives""" + +import os +import sqlite3 +from . import formatter + + +class DownloadArchive(): + + def __init__(self, path, format_string, pragma=None, + cache_key="_archive_key"): + try: + con = sqlite3.connect(path, timeout=60, check_same_thread=False) + except sqlite3.OperationalError: + os.makedirs(os.path.dirname(path)) + con = sqlite3.connect(path, timeout=60, check_same_thread=False) + con.isolation_level = None + + self.keygen = formatter.parse(format_string).format_map + self.connection = con + self.close = con.close + self.cursor = cursor = con.cursor() + self._cache_key = cache_key + + if pragma: + for stmt in pragma: + cursor.execute("PRAGMA " + stmt) + + try: + cursor.execute("CREATE TABLE IF NOT EXISTS archive " + "(entry TEXT PRIMARY KEY) WITHOUT ROWID") + except sqlite3.OperationalError: + # fallback for missing WITHOUT ROWID support (#553) + cursor.execute("CREATE TABLE IF NOT EXISTS archive " + "(entry TEXT PRIMARY KEY)") + + def add(self, kwdict): + """Add item described by 'kwdict' to archive""" + key = kwdict.get(self._cache_key) or self.keygen(kwdict) + self.cursor.execute( + "INSERT OR IGNORE INTO archive (entry) VALUES (?)", (key,)) + + def check(self, kwdict): + """Return True if the item described by 'kwdict' exists in archive""" + key = kwdict[self._cache_key] = self.keygen(kwdict) + self.cursor.execute( + "SELECT 1 FROM archive WHERE entry=? 
LIMIT 1", (key,)) + return self.cursor.fetchone() + + def finalize(self): + pass + + +class DownloadArchiveMemory(DownloadArchive): + + def __init__(self, path, format_string, pragma=None, + cache_key="_archive_key"): + DownloadArchive.__init__(self, path, format_string, pragma, cache_key) + self.keys = set() + + def add(self, kwdict): + self.keys.add( + kwdict.get(self._cache_key) or + self.keygen(kwdict)) + + def check(self, kwdict): + key = kwdict[self._cache_key] = self.keygen(kwdict) + if key in self.keys: + return True + self.cursor.execute( + "SELECT 1 FROM archive WHERE entry=? LIMIT 1", (key,)) + return self.cursor.fetchone() + + def finalize(self): + if not self.keys: + return + + cursor = self.cursor + with self.connection: + try: + cursor.execute("BEGIN") + except sqlite3.OperationalError: + pass + + stmt = "INSERT OR IGNORE INTO archive (entry) VALUES (?)" + if len(self.keys) < 100: + for key in self.keys: + cursor.execute(stmt, (key,)) + else: + cursor.executemany(stmt, ((key,) for key in self.keys)) diff --git a/gallery_dl/cookies.py b/gallery_dl/cookies.py index 478abb63..b4986c1e 100644 --- a/gallery_dl/cookies.py +++ b/gallery_dl/cookies.py @@ -10,7 +10,6 @@ # https://github.com/yt-dlp/yt-dlp/blob/master/yt_dlp/cookies.py import binascii -import contextlib import ctypes import logging import os @@ -147,7 +146,8 @@ def load_cookies_chrome(cookiejar, browser_name, profile=None, set_cookie(Cookie( 0, name, value, None, False, domain, bool(domain), domain.startswith("."), - path, bool(path), secure, expires, False, None, None, {}, + path, bool(path), secure, expires or None, False, + None, None, {}, )) if failed_cookies > 0: @@ -682,7 +682,8 @@ def _get_gnome_keyring_password(browser_keyring_name): # lists all keys and presumably searches for its key in the list. # It appears that we must do the same. 
# https://github.com/jaraco/keyring/issues/556 - with contextlib.closing(secretstorage.dbus_init()) as con: + con = secretstorage.dbus_init() + try: col = secretstorage.get_default_collection(con) label = browser_keyring_name + " Safe Storage" for item in col.get_all_items(): @@ -691,6 +692,8 @@ def _get_gnome_keyring_password(browser_keyring_name): else: _log_error("Failed to read from GNOME keyring") return b"" + finally: + con.close() def _get_linux_keyring_password(browser_keyring_name, keyring): @@ -857,7 +860,7 @@ class DatabaseConnection(): def Popen_communicate(*args): - proc = subprocess.Popen( + proc = util.Popen( args, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) try: stdout, stderr = proc.communicate() @@ -999,6 +1002,12 @@ def _decrypt_windows_dpapi(ciphertext): def _find_most_recently_used_file(root, filename): + # if the provided root points to an exact profile path + # check if it contains the wanted filename + first_choice = os.path.join(root, filename) + if os.path.exists(first_choice): + return first_choice + # if there are multiple browser profiles, take the most recently used one paths = [] for curr_root, dirs, files in os.walk(root): diff --git a/gallery_dl/downloader/http.py b/gallery_dl/downloader/http.py index f1d2c4a8..54750ac7 100644 --- a/gallery_dl/downloader/http.py +++ b/gallery_dl/downloader/http.py @@ -98,6 +98,8 @@ class HttpDownloader(DownloaderBase): metadata = self.metadata kwdict = pathfmt.kwdict + expected_status = kwdict.get( + "_http_expected_status", ()) adjust_extension = kwdict.get( "_http_adjust_extension", self.adjust_extension) @@ -151,7 +153,7 @@ class HttpDownloader(DownloaderBase): # check response code = response.status_code - if code == 200: # OK + if code == 200 or code in expected_status: # OK offset = 0 size = response.headers.get("Content-Length") elif code == 206: # Partial Content @@ -399,6 +401,9 @@ MIME_TYPES = { "video/webm": "webm", "video/ogg" : "ogg", "video/mp4" : "mp4", + "video/m4v" : "m4v", + "video/x-m4v": "m4v", + "video/quicktime": "mov", "audio/wav" : "wav", "audio/x-wav": "wav", @@ -440,7 +445,9 @@ SIGNATURE_CHECKS = { "cur" : lambda s: s[0:4] == b"\x00\x00\x02\x00", "psd" : lambda s: s[0:4] == b"8BPS", "mp4" : lambda s: (s[4:8] == b"ftyp" and s[8:11] in ( - b"mp4", b"avc", b"iso", b"M4V")), + b"mp4", b"avc", b"iso")), + "m4v" : lambda s: s[4:11] == b"ftypM4V", + "mov" : lambda s: s[4:12] == b"ftypqt ", "webm": lambda s: s[0:4] == b"\x1A\x45\xDF\xA3", "ogg" : lambda s: s[0:4] == b"OggS", "wav" : lambda s: (s[0:4] == b"RIFF" and diff --git a/gallery_dl/extractor/8chan.py b/gallery_dl/extractor/8chan.py index fc16f43c..a4b09977 100644 --- a/gallery_dl/extractor/8chan.py +++ b/gallery_dl/extractor/8chan.py @@ -26,6 +26,9 @@ class _8chanExtractor(Extractor): self.root = "https://8chan." 
+ match.group(1) Extractor.__init__(self, match) + def _init(self): + self.cookies.set("TOS", "1", domain=self.root.rpartition("/")[2]) + @memcache() def cookies_prepare(self): # fetch captcha cookies diff --git a/gallery_dl/extractor/bluesky.py b/gallery_dl/extractor/bluesky.py index f57651c0..c97bf65e 100644 --- a/gallery_dl/extractor/bluesky.py +++ b/gallery_dl/extractor/bluesky.py @@ -40,6 +40,7 @@ class BlueskyExtractor(Extractor): self.api = BlueskyAPI(self) self._user = self._user_did = None + self.instance = self.root.partition("://")[2] def items(self): for post in self.posts(): @@ -81,6 +82,7 @@ class BlueskyExtractor(Extractor): if self._metadata_user: post["user"] = self._user or post["author"] + post["instance"] = self.instance post["post_id"] = pid post["count"] = len(images) post["date"] = text.parse_datetime( @@ -315,7 +317,7 @@ class BlueskyAPI(): def get_author_feed(self, actor, filter="posts_and_author_threads"): endpoint = "app.bsky.feed.getAuthorFeed" params = { - "actor" : self._did_from_actor(actor), + "actor" : self._did_from_actor(actor, True), "filter": filter, "limit" : "100", } @@ -325,7 +327,7 @@ class BlueskyAPI(): endpoint = "app.bsky.feed.getFeed" params = { "feed" : "at://{}/app.bsky.feed.generator/{}".format( - self._did_from_actor(actor, False), feed), + self._did_from_actor(actor), feed), "limit": "100", } return self._pagination(endpoint, params) @@ -342,7 +344,7 @@ class BlueskyAPI(): endpoint = "app.bsky.feed.getListFeed" params = { "list" : "at://{}/app.bsky.graph.list/{}".format( - self._did_from_actor(actor, False), list), + self._did_from_actor(actor), list), "limit": "100", } return self._pagination(endpoint, params) @@ -389,7 +391,7 @@ class BlueskyAPI(): } return self._pagination(endpoint, params, "posts") - def _did_from_actor(self, actor, user_did=True): + def _did_from_actor(self, actor, user_did=False): if actor.startswith("did:"): did = actor else: diff --git a/gallery_dl/extractor/bunkr.py b/gallery_dl/extractor/bunkr.py index 1a0e47d7..a0933474 100644 --- a/gallery_dl/extractor/bunkr.py +++ b/gallery_dl/extractor/bunkr.py @@ -54,7 +54,6 @@ class BunkrAlbumExtractor(LolisafeAlbumExtractor): "album_id" : self.album_id, "album_name" : text.unescape(info[0]), "album_size" : size[1:-1], - "description": text.unescape(info[2]) if len(info) > 2 else "", "count" : len(urls), } diff --git a/gallery_dl/extractor/common.py b/gallery_dl/extractor/common.py index cf0f8c90..d80dea2a 100644 --- a/gallery_dl/extractor/common.py +++ b/gallery_dl/extractor/common.py @@ -14,6 +14,7 @@ import ssl import time import netrc import queue +import getpass import logging import datetime import requests @@ -21,6 +22,7 @@ import threading from requests.adapters import HTTPAdapter from .message import Message from .. 
import config, text, util, cache, exception +urllib3 = requests.packages.urllib3 class Extractor(): @@ -45,6 +47,8 @@ class Extractor(): def __init__(self, match): self.log = logging.getLogger(self.category) self.url = match.string + self.match = match + self.groups = match.groups() self._cfgpath = ("extractor", self.category, self.subcategory) self._parentdir = "" @@ -168,22 +172,25 @@ class Extractor(): requests.exceptions.ChunkedEncodingError, requests.exceptions.ContentDecodingError) as exc: msg = exc + code = 0 except (requests.exceptions.RequestException) as exc: raise exception.HttpError(exc) else: code = response.status_code if self._write_pages: self._dump_response(response) - if 200 <= code < 400 or fatal is None and \ - (400 <= code < 500) or not fatal and \ - (400 <= code < 429 or 431 <= code < 500): + if ( + code < 400 or + code < 500 and (not fatal and code != 429 or fatal is None) + ): if encoding: response.encoding = encoding return response if notfound and code == 404: raise exception.NotFoundError(notfound) - msg = "'{} {}' for '{}'".format(code, response.reason, url) + msg = "'{} {}' for '{}'".format( + code, response.reason, response.url) server = response.headers.get("Server") if server and server.startswith("cloudflare") and \ code in (403, 503): @@ -194,7 +201,10 @@ class Extractor(): if b'name="captcha-bypass"' in content: self.log.warning("Cloudflare CAPTCHA") break - if code not in retry_codes and code < 500: + + if code == 429 and self._interval_429: + pass + elif code not in retry_codes and code < 500: break finally: @@ -203,15 +213,25 @@ class Extractor(): self.log.debug("%s (%s/%s)", msg, tries, retries+1) if tries > retries: break - self.sleep( - max(tries, self._interval()) if self._interval else tries, - "retry") + + seconds = tries + if self._interval: + s = self._interval() + if seconds < s: + seconds = s + if code == 429 and self._interval_429: + s = self._interval_429() + if seconds < s: + seconds = s + self.wait(seconds=seconds, reason="429 Too Many Requests") + else: + self.sleep(seconds, "retry") tries += 1 raise exception.HttpError(msg, response) def wait(self, seconds=None, until=None, adjust=1.0, - reason="rate limit reset"): + reason="rate limit"): now = time.time() if seconds: @@ -234,7 +254,7 @@ class Extractor(): if reason: t = datetime.datetime.fromtimestamp(until).time() isotime = "{:02}:{:02}:{:02}".format(t.hour, t.minute, t.second) - self.log.info("Waiting until %s for %s.", isotime, reason) + self.log.info("Waiting until %s (%s)", isotime, reason) time.sleep(seconds) def sleep(self, seconds, reason): @@ -242,6 +262,15 @@ class Extractor(): seconds, reason) time.sleep(seconds) + def input(self, prompt, echo=True): + if echo: + try: + return input(prompt) + except (EOFError, OSError): + return None + else: + return getpass.getpass(prompt) + def _get_auth_info(self): """Return authentication information as (username, password) tuple""" username = self.config("username") @@ -274,6 +303,9 @@ class Extractor(): self.config("sleep-request", self.request_interval), self.request_interval_min, ) + self._interval_429 = util.build_duration_func( + self.config("sleep-429", 60), + ) if self._retries < 0: self._retries = float("inf") @@ -433,9 +465,11 @@ class Extractor(): if not path: return + path_tmp = path + ".tmp" try: - with open(path, "w") as fp: + with open(path_tmp, "w") as fp: util.cookiestxt_store(fp, self.cookies) + os.replace(path_tmp, path) except OSError as exc: self.log.warning("cookies: %s", exc) @@ -593,7 +627,7 @@ class 
GalleryExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.gallery_url = self.root + match.group(1) if url is None else url + self.gallery_url = self.root + self.groups[0] if url is None else url def items(self): self.login() @@ -668,7 +702,7 @@ class MangaExtractor(Extractor): def __init__(self, match, url=None): Extractor.__init__(self, match) - self.manga_url = url or self.root + match.group(1) + self.manga_url = self.root + self.groups[0] if url is None else url if self.config("chapter-reverse", False): self.reverse = not self.reverse @@ -730,17 +764,18 @@ class BaseExtractor(Extractor): instances = () def __init__(self, match): - if not self.category: - self._init_category(match) Extractor.__init__(self, match) + if not self.category: + self._init_category() + self._cfgpath = ("extractor", self.category, self.subcategory) - def _init_category(self, match): - for index, group in enumerate(match.groups()): + def _init_category(self): + for index, group in enumerate(self.groups): if group is not None: if index: self.category, self.root, info = self.instances[index-1] if not self.root: - self.root = text.root_from_url(match.group(0)) + self.root = text.root_from_url(self.match.group(0)) self.config_instance = info.get else: self.root = group @@ -800,12 +835,9 @@ def _build_requests_adapter(ssl_options, ssl_ciphers, source_address): pass if ssl_options or ssl_ciphers: - ssl_context = ssl.create_default_context() - if ssl_options: - ssl_context.options |= ssl_options - if ssl_ciphers: - ssl_context.set_ecdh_curve("prime256v1") - ssl_context.set_ciphers(ssl_ciphers) + ssl_context = urllib3.connection.create_urllib3_context( + options=ssl_options or None, ciphers=ssl_ciphers) + ssl_context.check_hostname = False else: ssl_context = None @@ -925,8 +957,6 @@ SSL_CIPHERS = { } -urllib3 = requests.packages.urllib3 - # detect brotli support try: BROTLI = urllib3.response.brotli is not None diff --git a/gallery_dl/extractor/deviantart.py b/gallery_dl/extractor/deviantart.py index 08961614..993885ab 100644 --- a/gallery_dl/extractor/deviantart.py +++ b/gallery_dl/extractor/deviantart.py @@ -18,12 +18,12 @@ import binascii import time import re - BASE_PATTERN = ( r"(?:https?://)?(?:" r"(?:www\.)?(?:fx)?deviantart\.com/(?!watch/)([\w-]+)|" r"(?!www\.)([\w-]+)\.(?:fx)?deviantart\.com)" ) +DEFAULT_AVATAR = "https://a.deviantart.net/avatars/default.gif" class DeviantartExtractor(Extractor): @@ -84,6 +84,16 @@ class DeviantartExtractor(Extractor): else: self.commit_journal = None + def request(self, url, **kwargs): + if "fatal" not in kwargs: + kwargs["fatal"] = False + while True: + response = Extractor.request(self, url, **kwargs) + if response.status_code != 403 or \ + b"Request blocked." 
not in response.content: + return response + self.wait(seconds=300, reason="CloudFront block") + def skip(self, num): self.offset += num return num @@ -177,6 +187,10 @@ class DeviantartExtractor(Extractor): for comment in deviation["comments"]: user = comment["user"] name = user["username"].lower() + if user["usericon"] == DEFAULT_AVATAR: + self.log.debug( + "Skipping avatar of '%s' (default)", name) + continue _user_details.update(name, user) url = "{}/{}/avatar/".format(self.root, name) @@ -209,7 +223,9 @@ class DeviantartExtractor(Extractor): """Adjust the contents of a Deviation-object""" if "index" not in deviation: try: - if deviation["url"].startswith("https://sta.sh"): + if deviation["url"].startswith(( + "https://www.deviantart.com/stash/", "https://sta.sh", + )): filename = deviation["content"]["src"].split("/")[5] deviation["index_base36"] = filename.partition("-")[0][1:] deviation["index"] = id_from_base36( @@ -456,18 +472,12 @@ class DeviantartExtractor(Extractor): def _limited_request(self, url, **kwargs): """Limits HTTP requests to one every 2 seconds""" - kwargs["fatal"] = None diff = time.time() - DeviantartExtractor._last_request if diff < 2.0: self.sleep(2.0 - diff, "request") - - while True: - response = self.request(url, **kwargs) - if response.status_code != 403 or \ - b"Request blocked." not in response.content: - DeviantartExtractor._last_request = time.time() - return response - self.wait(seconds=180) + response = self.request(url, **kwargs) + DeviantartExtractor._last_request = time.time() + return response def _fetch_premium(self, deviation): try: @@ -585,7 +595,13 @@ class DeviantartAvatarExtractor(DeviantartExtractor): return () icon = user["usericon"] - index = icon.rpartition("?")[2] + if icon == DEFAULT_AVATAR: + self.log.debug("Skipping avatar of '%s' (default)", name) + return () + + _, sep, index = icon.rpartition("?") + if not sep: + index = "0" formats = self.config("formats") if not formats: @@ -668,7 +684,8 @@ class DeviantartStashExtractor(DeviantartExtractor): """Extractor for sta.sh-ed deviations""" subcategory = "stash" archive_fmt = "{index}.{extension}" - pattern = r"(?:https?://)?sta\.sh/([a-z0-9]+)" + pattern = (r"(?:https?://)?(?:(?:www\.)?deviantart\.com/stash|sta\.sh)" + r"/([a-z0-9]+)") example = "https://sta.sh/abcde" skip = Extractor.skip @@ -689,7 +706,7 @@ class DeviantartStashExtractor(DeviantartExtractor): if uuid: deviation = self.api.deviation(uuid) deviation["index"] = text.parse_int(text.extr( - page, 'gmi-deviationid="', '"')) + page, '\\"deviationId\\":', ',')) yield deviation return @@ -1405,9 +1422,14 @@ class DeviantartOAuthAPI(): self.authenticate(None if public else self.refresh_token_key) kwargs["headers"] = self.headers response = self.extractor.request(url, **kwargs) - data = response.json() - status = response.status_code + try: + data = response.json() + except ValueError: + self.log.error("Unable to parse API response") + data = {} + + status = response.status_code if 200 <= status < 400: if self.delay > self.delay_min: self.delay -= 1 @@ -1435,9 +1457,8 @@ class DeviantartOAuthAPI(): self.log.info( "Register your own OAuth application and use its " "credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/master/do" - "cs/configuration.rst#extractordeviantartclient-id" - "--client-secret") + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-deviantart-client-id-client-secret") else: if log: self.log.error(msg) diff --git a/gallery_dl/extractor/exhentai.py 
b/gallery_dl/extractor/exhentai.py index acad95ce..18054035 100644 --- a/gallery_dl/extractor/exhentai.py +++ b/gallery_dl/extractor/exhentai.py @@ -50,7 +50,7 @@ class ExhentaiExtractor(Extractor): def request(self, url, **kwargs): response = Extractor.request(self, url, **kwargs) - if response.history and response.headers.get("Content-Length") == "0": + if "Cache-Control" not in response.headers and not response.content: self.log.info("blank page") raise exception.AuthorizationError() return response @@ -95,7 +95,11 @@ class ExhentaiExtractor(Extractor): self.cookies.clear() response = self.request(url, method="POST", headers=headers, data=data) - if b"You are now logged in as:" not in response.content: + content = response.content + if b"You are now logged in as:" not in content: + if b"The captcha was not entered correctly" in content: + raise exception.AuthenticationError( + "CAPTCHA required. Use cookies instead.") raise exception.AuthenticationError() # collect more cookies @@ -437,7 +441,7 @@ class ExhentaiGalleryExtractor(ExhentaiExtractor): raise exception.AuthorizationError() if page.startswith(("Key missing", "Gallery not found")): raise exception.NotFoundError("gallery") - if "hentai.org/mpv/" in page: + if page.count("hentai.org/mpv/") > 1: self.log.warning("Enabled Multi-Page Viewer is not supported") return page diff --git a/gallery_dl/extractor/fapello.py b/gallery_dl/extractor/fapello.py index aff8e616..838ae7b6 100644 --- a/gallery_dl/extractor/fapello.py +++ b/gallery_dl/extractor/fapello.py @@ -42,7 +42,8 @@ class FapelloPostExtractor(Extractor): "type" : "video" if 'type="video' in page else "photo", "thumbnail": text.extr(page, 'poster="', '"'), } - url = text.extr(page, 'src="', '"') + url = text.extr(page, 'src="', '"').replace( + ".md", "").replace(".th", "") yield Message.Directory, data yield Message.Url, url, text.nameext_from_url(url, data) diff --git a/gallery_dl/extractor/flickr.py b/gallery_dl/extractor/flickr.py index f7dc3cc2..c94a110a 100644 --- a/gallery_dl/extractor/flickr.py +++ b/gallery_dl/extractor/flickr.py @@ -77,6 +77,8 @@ class FlickrImageExtractor(FlickrExtractor): photo = self.api.photos_getInfo(self.item_id) if self.api.exif: photo.update(self.api.photos_getExif(self.item_id)) + if self.api.contexts: + photo.update(self.api.photos_getAllContexts(self.item_id)) if photo["media"] == "video" and self.api.videos: self.api._extract_video(photo) @@ -268,6 +270,8 @@ class FlickrAPI(oauth.OAuth1API): self.exif = extractor.config("exif", False) self.videos = extractor.config("videos", True) + self.contexts = extractor.config("contexts", False) + self.maxsize = extractor.config("size-max") if isinstance(self.maxsize, str): for fmt, fmtname, fmtwidth in self.FORMATS: @@ -311,6 +315,13 @@ class FlickrAPI(oauth.OAuth1API): params = {"user_id": user_id} return self._pagination("people.getPhotos", params) + def photos_getAllContexts(self, photo_id): + """Returns all visible sets and pools the photo belongs to.""" + params = {"photo_id": photo_id} + data = self._call("photos.getAllContexts", params) + del data["stat"] + return data + def photos_getExif(self, photo_id): """Retrieves a list of EXIF/TIFF/GPS tags for a given photo.""" params = {"photo_id": photo_id} @@ -444,6 +455,8 @@ class FlickrAPI(oauth.OAuth1API): if self.exif: photo.update(self.photos_getExif(photo["id"])) + if self.contexts: + photo.update(self.photos_getAllContexts(photo["id"])) photo["id"] = text.parse_int(photo["id"]) if "owner" in photo: diff --git 
a/gallery_dl/extractor/foolfuuka.py b/gallery_dl/extractor/foolfuuka.py index 715abcb7..85dd8969 100644 --- a/gallery_dl/extractor/foolfuuka.py +++ b/gallery_dl/extractor/foolfuuka.py @@ -117,8 +117,8 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(match.lastindex-1) - self.thread = match.group(match.lastindex) + self.board = self.groups[-2] + self.thread = self.groups[-1] self.data = None def metadata(self): @@ -140,20 +140,22 @@ class FoolfuukaThreadExtractor(FoolfuukaExtractor): class FoolfuukaBoardExtractor(FoolfuukaExtractor): """Base extractor for FoolFuuka based boards/archives""" subcategory = "board" - pattern = BASE_PATTERN + r"/([^/?#]+)/\d*$" + pattern = BASE_PATTERN + r"/([^/?#]+)(?:/(?:page/)?(\d*))?$" example = "https://archived.moe/a/" def __init__(self, match): FoolfuukaExtractor.__init__(self, match) - self.board = match.group(match.lastindex) + self.board = self.groups[-2] + self.page = self.groups[-1] def items(self): index_base = "{}/_/api/chan/index/?board={}&page=".format( self.root, self.board) thread_base = "{}/{}/thread/".format(self.root, self.board) - for page in itertools.count(1): - with self.request(index_base + format(page)) as response: + page = self.page + for pnum in itertools.count(text.parse_int(page, 1)): + with self.request(index_base + format(pnum)) as response: try: threads = response.json() except ValueError: @@ -167,6 +169,9 @@ class FoolfuukaBoardExtractor(FoolfuukaExtractor): thread["_extractor"] = FoolfuukaThreadExtractor yield Message.Queue, thread["url"], thread + if page: + return + class FoolfuukaSearchExtractor(FoolfuukaExtractor): """Base extractor for search results on FoolFuuka based boards/archives""" @@ -179,17 +184,16 @@ class FoolfuukaSearchExtractor(FoolfuukaExtractor): def __init__(self, match): FoolfuukaExtractor.__init__(self, match) self.params = params = {} - args = match.group(match.lastindex).split("/") - key = None - for arg in args: + key = None + for arg in self.groups[-1].split("/"): if key: params[key] = text.unescape(arg) key = None else: key = arg - board = match.group(match.lastindex-1) + board = self.groups[-2] if board != "_": params["boards"] = board diff --git a/gallery_dl/extractor/furaffinity.py b/gallery_dl/extractor/furaffinity.py index 56721d0f..6040187e 100644 --- a/gallery_dl/extractor/furaffinity.py +++ b/gallery_dl/extractor/furaffinity.py @@ -11,7 +11,7 @@ from .common import Extractor, Message from .. 
import text, util -BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?furaffinity\.net" +BASE_PATTERN = r"(?:https?://)?(?:www\.|sfw\.)?(?:f[ux]|f?xfu)raffinity\.net" class FuraffinityExtractor(Extractor): diff --git a/gallery_dl/extractor/gelbooru.py b/gallery_dl/extractor/gelbooru.py index 83f13922..37c776e6 100644 --- a/gallery_dl/extractor/gelbooru.py +++ b/gallery_dl/extractor/gelbooru.py @@ -32,6 +32,9 @@ class GelbooruBase(): url = self.root + "/index.php?page=dapi&q=index&json=1" data = self.request(url, params=params).json() + if not key: + return data + try: posts = data[key] except KeyError: @@ -48,19 +51,44 @@ class GelbooruBase(): params["pid"] = self.page_start params["limit"] = self.per_page limit = self.per_page // 2 + pid = False + + if "tags" in params: + tags = params["tags"].split() + op = "<" + id = False + + for tag in tags: + if tag.startswith("sort:"): + if tag == "sort:id:asc": + op = ">" + elif tag == "sort:id" or tag.startswith("sort:id:"): + op = "<" + else: + pid = True + elif tag.startswith("id:"): + id = True + + if not pid: + if id: + tag = "id:" + op + tags = [t for t in tags if not t.startswith(tag)] + tags = "{} id:{}".format(" ".join(tags), op) while True: posts = self._api_request(params) - for post in posts: - yield post + yield from posts if len(posts) < limit: return - if "pid" in params: - del params["pid"] - params["tags"] = "{} id:<{}".format(self.tags, post["id"]) + if pid: + params["pid"] += 1 + else: + if "pid" in params: + del params["pid"] + params["tags"] = tags + str(posts[-1]["id"]) def _pagination_html(self, params): url = self.root + "/index.php" @@ -167,13 +195,61 @@ class GelbooruFavoriteExtractor(GelbooruBase, params = { "s" : "favorite", "id" : self.favorite_id, - "limit": "1", + "limit": "2", } + data = self._api_request(params, None, True) - count = self._api_request(params, "@attributes", True)[0]["count"] - if count <= self.offset: - return + count = data["@attributes"]["count"] + self.log.debug("API reports %s favorite entries", count) + favs = data["favorite"] + try: + order = 1 if favs[0]["id"] < favs[1]["id"] else -1 + except LookupError as exc: + self.log.debug( + "Error when determining API favorite order (%s: %s)", + exc.__class__.__name__, exc) + order = -1 + else: + self.log.debug("API yields favorites in %sscending order", + "a" if order > 0 else "de") + + order_favs = self.config("order-posts") + if order_favs and order_favs[0] in ("r", "a"): + self.log.debug("Returning them in reverse") + order = -order + + if order < 0: + return self._pagination(params, count) + return self._pagination_reverse(params, count) + + def _pagination(self, params, count): + if self.offset: + pnum, skip = divmod(self.offset, self.per_page) + else: + pnum = skip = 0 + + params["pid"] = pnum + params["limit"] = self.per_page + + while True: + favs = self._api_request(params, "favorite") + + if not favs: + return + + if skip: + favs = favs[skip:] + skip = 0 + + for fav in favs: + for post in self._api_request({"id": fav["favorite"]}): + post["date_favorited"] = text.parse_timestamp(fav["added"]) + yield post + + params["pid"] += 1 + + def _pagination_reverse(self, params, count): pnum, last = divmod(count-1, self.per_page) if self.offset > last: # page number change @@ -182,12 +258,11 @@ class GelbooruFavoriteExtractor(GelbooruBase, pnum -= diff + 1 skip = self.offset - # paginate over them in reverse params["pid"] = pnum params["limit"] = self.per_page while True: - favs = self._api_request(params, "favorite", True) + favs = 
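Note: the rewritten `GelbooruBase._pagination()` above first scans the search tags to decide how to page: plain searches (and `sort:id` variants) keep using an `id:<` / `id:>` cursor appended to the tags, while any other `sort:` order falls back to incrementing `pid`. A simplified sketch of that decision with hypothetical helper names, illustrative only:

    def plan_gelbooru_pagination(tag_string):
        # returns (cursor_op, use_pid, cleaned_tags)
        tags = tag_string.split()
        op = "<"          # default: walk ids downward
        use_pid = False   # page by 'pid' only for custom sort orders
        has_id = False

        for tag in tags:
            if tag.startswith("sort:"):
                if tag == "sort:id:asc":
                    op = ">"
                elif tag == "sort:id" or tag.startswith("sort:id:"):
                    op = "<"
                else:
                    use_pid = True
            elif tag.startswith("id:"):
                has_id = True

        if not use_pid and has_id:
            # drop a user-supplied id: filter that would clash with the cursor
            prefix = "id:" + op
            tags = [t for t in tags if not t.startswith(prefix)]
        return op, use_pid, tags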
self._api_request(params, "favorite") favs.reverse() if skip: @@ -195,7 +270,9 @@ class GelbooruFavoriteExtractor(GelbooruBase, skip = 0 for fav in favs: - yield from self._api_request({"id": fav["favorite"]}) + for post in self._api_request({"id": fav["favorite"]}): + post["date_favorited"] = text.parse_timestamp(fav["added"]) + yield post params["pid"] -= 1 if params["pid"] < 0: diff --git a/gallery_dl/extractor/gofile.py b/gallery_dl/extractor/gofile.py index 289f91cb..f0eb4e9c 100644 --- a/gallery_dl/extractor/gofile.py +++ b/gallery_dl/extractor/gofile.py @@ -41,9 +41,13 @@ class GofileFolderExtractor(Extractor): folder = self._get_content(self.content_id, password) yield Message.Directory, folder + try: + contents = folder.pop("children") + except KeyError: + raise exception.AuthorizationError("Password required") + num = 0 - contents = folder.pop("contents") - for content_id in folder["childs"]: + for content_id in folder["childrenIds"]: content = contents[content_id] content["folder"] = folder @@ -67,31 +71,32 @@ class GofileFolderExtractor(Extractor): @memcache() def _create_account(self): self.log.debug("Creating temporary account") - return self._api_request("createAccount")["token"] + return self._api_request("accounts", method="POST")["token"] @cache(maxage=86400) def _get_website_token(self): self.log.debug("Fetching website token") page = self.request(self.root + "/dist/js/alljs.js").text - return text.extr(page, 'fetchData.wt = "', '"') + return text.extr(page, 'wt: "', '"') def _get_content(self, content_id, password=None): + headers = {"Authorization": "Bearer " + self.api_token} + params = {"wt": self.website_token} if password is not None: - password = hashlib.sha256(password.encode()).hexdigest() - return self._api_request("getContent", { - "contentId" : content_id, - "token" : self.api_token, - "wt" : self.website_token, - "password" : password, - }) + params["password"] = hashlib.sha256(password.encode()).hexdigest() + return self._api_request("contents/" + content_id, params, headers) - def _api_request(self, endpoint, params=None): + def _api_request(self, endpoint, params=None, headers=None, method="GET"): response = self.request( - "https://api.gofile.io/" + endpoint, params=params).json() + "https://api.gofile.io/" + endpoint, + method=method, params=params, headers=headers, + ).json() if response["status"] != "ok": if response["status"] == "error-notFound": raise exception.NotFoundError("content") + if response["status"] == "error-passwordRequired": + raise exception.AuthorizationError("Password required") raise exception.StopExtraction( "%s failed (Status: %s)", endpoint, response["status"]) diff --git a/gallery_dl/extractor/hiperdex.py b/gallery_dl/extractor/hiperdex.py index 20491b56..aadce6ca 100644 --- a/gallery_dl/extractor/hiperdex.py +++ b/gallery_dl/extractor/hiperdex.py @@ -25,7 +25,7 @@ class HiperdexBase(): @memcache(keyarg=1) def manga_data(self, manga, page=None): if not page: - url = "{}/manga/{}/".format(self.root, manga) + url = "{}/mangas/{}/".format(self.root, manga) page = self.request(url).text extr = text.extract_from(page) @@ -33,7 +33,7 @@ class HiperdexBase(): "url" : text.unescape(extr( 'property="og:url" content="', '"')), "manga" : text.unescape(extr( - '"headline": "', '"')), + ' property="name" title="', '"')), "score" : text.parse_float(extr( 'id="averagerate">', '<')), "author" : text.remove_html(extr( @@ -68,8 +68,8 @@ class HiperdexBase(): class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): """Extractor for manga 
chapters from hiperdex.com""" - pattern = BASE_PATTERN + r"(/manga/([^/?#]+)/([^/?#]+))" - example = "https://hiperdex.com/manga/MANGA/CHAPTER/" + pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+)/([^/?#]+))" + example = "https://hiperdex.com/mangas/MANGA/CHAPTER/" def __init__(self, match): root, path, self.manga, self.chapter = match.groups() @@ -90,8 +90,8 @@ class HiperdexChapterExtractor(HiperdexBase, ChapterExtractor): class HiperdexMangaExtractor(HiperdexBase, MangaExtractor): """Extractor for manga from hiperdex.com""" chapterclass = HiperdexChapterExtractor - pattern = BASE_PATTERN + r"(/manga/([^/?#]+))/?$" - example = "https://hiperdex.com/manga/MANGA/" + pattern = BASE_PATTERN + r"(/mangas?/([^/?#]+))/?$" + example = "https://hiperdex.com/mangas/MANGA/" def __init__(self, match): root, path, self.manga = match.groups() diff --git a/gallery_dl/extractor/hotleak.py b/gallery_dl/extractor/hotleak.py index 6d3184d9..a2b51be2 100644 --- a/gallery_dl/extractor/hotleak.py +++ b/gallery_dl/extractor/hotleak.py @@ -23,6 +23,7 @@ class HotleakExtractor(Extractor): def items(self): for post in self.posts(): + post["_http_expected_status"] = (404,) yield Message.Directory, post yield Message.Url, post["url"], post diff --git a/gallery_dl/extractor/idolcomplex.py b/gallery_dl/extractor/idolcomplex.py index c249a3e6..dfd9a317 100644 --- a/gallery_dl/extractor/idolcomplex.py +++ b/gallery_dl/extractor/idolcomplex.py @@ -101,9 +101,8 @@ class IdolcomplexExtractor(SankakuExtractor): page = self.request(url, retries=10).text extr = text.extract_from(page) - pid_alnum = extr('/posts/', '"') - vavg = extr('itemprop="ratingValue">', "<") - vcnt = extr('itemprop="reviewCount">', "<") + vavg = extr('id="rating"', "") + vcnt = extr('>Votes:', "<") pid = extr(">Post ID:", "<") created = extr(' title="', '"') @@ -120,10 +119,10 @@ class IdolcomplexExtractor(SankakuExtractor): rating = extr(">Rating:", "", "") + while True: - extr = text.extract_from(self.request(url, params=params).text) cnt = 0 while True: - gid = extr('", "<") + yield gid, extr("", "<"), folder_name cnt += 1 if cnt < 20: break params["page"] += 1 + extr = text.extract_from(self.request(url, params=params).text) class ImagefapUserExtractor(ImagefapExtractor): diff --git a/gallery_dl/extractor/imgur.py b/gallery_dl/extractor/imgur.py index 8884d3ee..86b1edd4 100644 --- a/gallery_dl/extractor/imgur.py +++ b/gallery_dl/extractor/imgur.py @@ -39,10 +39,15 @@ class ImgurExtractor(Extractor): image["url"] = url = "https://i.imgur.com/{}.{}".format( image["id"], image["ext"]) image["date"] = text.parse_datetime(image["created_at"]) + image["_http_validate"] = self._validate text.nameext_from_url(url, image) return url + def _validate(self, response): + return (not response.history or + not response.url.endswith("/removed.png")) + def _items_queue(self, items): album_ex = ImgurAlbumExtractor image_ex = ImgurImageExtractor diff --git a/gallery_dl/extractor/inkbunny.py b/gallery_dl/extractor/inkbunny.py index 62586af5..2ae8cbe0 100644 --- a/gallery_dl/extractor/inkbunny.py +++ b/gallery_dl/extractor/inkbunny.py @@ -330,15 +330,18 @@ class InkbunnyAPI(): def _call(self, endpoint, params): url = "https://inkbunny.net/api_" + endpoint + ".php" params["sid"] = self.session_id - data = self.extractor.request(url, params=params).json() - if "error_code" in data: + while True: + data = self.extractor.request(url, params=params).json() + + if "error_code" not in data: + return data + if str(data["error_code"]) == "2": self.authenticate(invalidate=True) - 
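Note: `InkbunnyAPI._call()` now retries in a loop instead of recursing: error code 2 (treated as an expired session) triggers a re-login and another attempt, while any other error still aborts the extraction. A self-contained sketch of the same pattern, with placeholder authenticate/request helpers that are not gallery-dl's actual API:

    import requests

    class InkbunnyClient:
        # placeholder client; only the retry structure mirrors the diff
        def __init__(self):
            self.session_id = None

        def authenticate(self, invalidate=False):
            # stub: obtain (or refresh) a session id here
            self.session_id = "new-session-id"

        def call(self, endpoint, params):
            url = "https://inkbunny.net/api_" + endpoint + ".php"
            while True:
                params["sid"] = self.session_id
                data = requests.get(url, params=params).json()
                if "error_code" not in data:
                    return data
                if str(data["error_code"]) == "2":
                    # expired/invalid session: log in again and retry
                    self.authenticate(invalidate=True)
                    continue
                raise RuntimeError(data.get("error_message"))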
return self._call(endpoint, params) - raise exception.StopExtraction(data.get("error_message")) + continue - return data + raise exception.StopExtraction(data.get("error_message")) def _pagination_search(self, params): params["page"] = 1 diff --git a/gallery_dl/extractor/instagram.py b/gallery_dl/extractor/instagram.py index ddc11318..9c2b1de2 100644 --- a/gallery_dl/extractor/instagram.py +++ b/gallery_dl/extractor/instagram.py @@ -165,7 +165,7 @@ class InstagramExtractor(Extractor): data = { "post_id" : post["pk"], "post_shortcode": post["code"], - "likes": post.get("like_count"), + "likes": post.get("like_count", 0), "pinned": post.get("timeline_pinned_user_ids", ()), "date": text.parse_timestamp(post.get("taken_at")), } @@ -736,7 +736,7 @@ class InstagramRestAPI(): not user["followed_by_viewer"]: name = user["username"] s = "" if name.endswith("s") else "s" - raise exception.StopExtraction("%s'%s posts are private", name, s) + self.extractor.log.warning("%s'%s posts are private", name, s) self.extractor._assign_user(user) return user["id"] diff --git a/gallery_dl/extractor/kemonoparty.py b/gallery_dl/extractor/kemonoparty.py index 06dc861e..b0c24de7 100644 --- a/gallery_dl/extractor/kemonoparty.py +++ b/gallery_dl/extractor/kemonoparty.py @@ -41,6 +41,9 @@ class KemonopartyExtractor(Extractor): self.revisions = self.config("revisions") if self.revisions: self.revisions_unique = (self.revisions == "unique") + order = self.config("order-revisions") + self.revisions_reverse = order[0] in ("r", "a") if order else False + self._prepare_ddosguard_cookies() self._find_inline = re.compile( r'src="(?:https?://(?:kemono|coomer)\.(?:su|party))?(/inline/[^"]+' @@ -54,7 +57,7 @@ class KemonopartyExtractor(Extractor): generators = self._build_file_generators(self.config("files")) duplicates = self.config("duplicates") comments = self.config("comments") - username = dms = None + username = dms = announcements = None # prevent files from being sent with gzip compression headers = {"Accept-Encoding": "identity"} @@ -65,6 +68,8 @@ class KemonopartyExtractor(Extractor): '"): - footer = text.extr(dm, "") - dms.append({ + cards = [] + for card in text.extract_iter(page, ""): + footer = text.extr(card, "") + cards.append({ "body": text.unescape(text.extr( - dm, "
", "
", " 19: @@ -232,6 +241,7 @@ class KemonopartyExtractor(Extractor): except exception.HttpError: post["revision_hash"] = self._revision_hash(post) post["revision_index"] = 1 + post["revision_count"] = 1 return (post,) revs.insert(0, post) @@ -247,22 +257,30 @@ class KemonopartyExtractor(Extractor): uniq.append(rev) revs = uniq - idx = len(revs) + cnt = idx = len(revs) for rev in revs: rev["revision_index"] = idx + rev["revision_count"] = cnt idx -= 1 + if self.revisions_reverse: + revs.reverse() + return revs def _revisions_all(self, url): revs = self.request(url + "/revisions").json() - idx = len(revs) + cnt = idx = len(revs) for rev in revs: rev["revision_hash"] = self._revision_hash(rev) rev["revision_index"] = idx + rev["revision_count"] = cnt idx -= 1 + if self.revisions_reverse: + revs.reverse() + return revs def _revision_hash(self, revision): @@ -482,7 +500,8 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): def __init__(self, match): KemonopartyExtractor.__init__(self, match) - self.favorites = (text.parse_query(match.group(3)).get("type") or + self.params = text.parse_query(match.group(3)) + self.favorites = (self.params.get("type") or self.config("favorites") or "artist") @@ -490,9 +509,17 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): self._prepare_ddosguard_cookies() self.login() + sort = self.params.get("sort") + order = self.params.get("order") or "desc" + if self.favorites == "artist": users = self.request( self.root + "/api/v1/account/favorites?type=artist").json() + + if not sort: + sort = "updated" + users.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for user in users: user["_extractor"] = KemonopartyUserExtractor url = "{}/{}/user/{}".format( @@ -502,6 +529,11 @@ class KemonopartyFavoriteExtractor(KemonopartyExtractor): elif self.favorites == "post": posts = self.request( self.root + "/api/v1/account/favorites?type=post").json() + + if not sort: + sort = "faved_seq" + posts.sort(key=lambda x: x[sort], reverse=(order == "desc")) + for post in posts: post["_extractor"] = KemonopartyPostExtractor url = "{}/{}/user/{}/post/{}".format( diff --git a/gallery_dl/extractor/lensdump.py b/gallery_dl/extractor/lensdump.py index d4ccf33b..12e8860c 100644 --- a/gallery_dl/extractor/lensdump.py +++ b/gallery_dl/extractor/lensdump.py @@ -104,7 +104,7 @@ class LensdumpImageExtractor(LensdumpBase, Extractor): filename_fmt = "{category}_{id}{title:?_//}.{extension}" directory_fmt = ("{category}",) archive_fmt = "{id}" - pattern = BASE_PATTERN + r"/i/(\w+)" + pattern = r"(?:https?://)?(?:(?:i\d?\.)?lensdump\.com|\w\.l3n\.co)/i/(\w+)" example = "https://lensdump.com/i/ID" def __init__(self, match): diff --git a/gallery_dl/extractor/mastodon.py b/gallery_dl/extractor/mastodon.py index 68b41961..cb7f701c 100644 --- a/gallery_dl/extractor/mastodon.py +++ b/gallery_dl/extractor/mastodon.py @@ -29,6 +29,7 @@ class MastodonExtractor(BaseExtractor): self.instance = self.root.partition("://")[2] self.reblogs = self.config("reblogs", False) self.replies = self.config("replies", True) + self.cards = self.config("cards", False) def items(self): for status in self.statuses(): @@ -48,6 +49,17 @@ class MastodonExtractor(BaseExtractor): if status["reblog"]: attachments.extend(status["reblog"]["media_attachments"]) + if self.cards: + card = status.get("card") + if card: + url = card.get("image") + if url: + card["weburl"] = card.get("url") + card["url"] = url + card["id"] = "card" + "".join( + url.split("/")[6:-2]).lstrip("0") + attachments.append(card) + 
status["instance"] = self.instance acct = status["account"]["acct"] status["instance_remote"] = \ @@ -70,7 +82,11 @@ class MastodonExtractor(BaseExtractor): def _check_moved(self, account): self._check_moved = None - if "moved" in account: + # Certain fediverse software (such as Iceshrimp and Sharkey) have a + # null account "moved" field instead of not having it outright. + # To handle this, check if the "moved" value is truthy instead + # if only it exists. + if account.get("moved"): self.log.warning("Account '%s' moved to '%s'", account["acct"], account["moved"]["acct"]) @@ -116,6 +132,7 @@ class MastodonUserExtractor(MastodonExtractor): api.account_id_by_username(self.item), only_media=( not self.reblogs and + not self.cards and not self.config("text-posts", False) ), exclude_replies=not self.replies, @@ -132,6 +149,36 @@ class MastodonBookmarkExtractor(MastodonExtractor): return MastodonAPI(self).account_bookmarks() +class MastodonFavoriteExtractor(MastodonExtractor): + """Extractor for mastodon favorites""" + subcategory = "favorite" + pattern = BASE_PATTERN + r"/favourites" + example = "https://mastodon.social/favourites" + + def statuses(self): + return MastodonAPI(self).account_favorites() + + +class MastodonListExtractor(MastodonExtractor): + """Extractor for mastodon lists""" + subcategory = "list" + pattern = BASE_PATTERN + r"/lists/(\w+)" + example = "https://mastodon.social/lists/12345" + + def statuses(self): + return MastodonAPI(self).timelines_list(self.item) + + +class MastodonHashtagExtractor(MastodonExtractor): + """Extractor for mastodon hashtags""" + subcategory = "hashtag" + pattern = BASE_PATTERN + r"/tags/(\w+)" + example = "https://mastodon.social/tags/NAME" + + def statuses(self): + return MastodonAPI(self).timelines_tag(self.item) + + class MastodonFollowingExtractor(MastodonExtractor): """Extractor for followed mastodon users""" subcategory = "following" @@ -201,37 +248,55 @@ class MastodonAPI(): raise exception.NotFoundError("account") def account_bookmarks(self): + """Statuses the user has bookmarked""" endpoint = "/v1/bookmarks" return self._pagination(endpoint, None) + def account_favorites(self): + """Statuses the user has favourited""" + endpoint = "/v1/favourites" + return self._pagination(endpoint, None) + def account_following(self, account_id): + """Accounts which the given account is following""" endpoint = "/v1/accounts/{}/following".format(account_id) return self._pagination(endpoint, None) def account_lookup(self, username): + """Quickly lookup a username to see if it is available""" endpoint = "/v1/accounts/lookup" params = {"acct": username} return self._call(endpoint, params).json() def account_search(self, query, limit=40): - """Search for accounts""" + """Search for matching accounts by username or display name""" endpoint = "/v1/accounts/search" params = {"q": query, "limit": limit} return self._call(endpoint, params).json() def account_statuses(self, account_id, only_media=True, exclude_replies=False): - """Fetch an account's statuses""" + """Statuses posted to the given account""" endpoint = "/v1/accounts/{}/statuses".format(account_id) - params = {"only_media" : "1" if only_media else "0", - "exclude_replies": "1" if exclude_replies else "0"} + params = {"only_media" : "true" if only_media else "false", + "exclude_replies": "true" if exclude_replies else "false"} return self._pagination(endpoint, params) def status(self, status_id): - """Fetch a status""" + """Obtain information about a status""" endpoint = "/v1/statuses/" + status_id 
return self._call(endpoint).json() + def timelines_list(self, list_id): + """View statuses in the given list timeline""" + endpoint = "/v1/timelines/list/" + list_id + return self._pagination(endpoint, None) + + def timelines_tag(self, hashtag): + """View public statuses containing the given hashtag""" + endpoint = "/v1/timelines/tag/" + hashtag + return self._pagination(endpoint, None) + def _call(self, endpoint, params=None): if endpoint.startswith("http"): url = endpoint diff --git a/gallery_dl/extractor/naver.py b/gallery_dl/extractor/naver.py index 55faf9e7..d3150e6d 100644 --- a/gallery_dl/extractor/naver.py +++ b/gallery_dl/extractor/naver.py @@ -26,7 +26,8 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): "{post[date]:%Y-%m-%d} {post[title]}") archive_fmt = "{blog[id]}_{post[num]}_{num}" pattern = (r"(?:https?://)?blog\.naver\.com/" - r"(?:PostView\.nhn\?blogId=(\w+)&logNo=(\d+)|(\w+)/(\d+)/?$)") + r"(?:PostView\.n(?:aver|hn)\?blogId=(\w+)&logNo=(\d+)|" + r"(\w+)/(\d+)/?$)") example = "https://blog.naver.com/BLOGID/12345" def __init__(self, match): @@ -46,8 +47,10 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): extr = text.extract_from(page) data = { "post": { - "title" : extr('"og:title" content="', '"'), - "description": extr('"og:description" content="', '"'), + "title" : text.unescape(extr( + '"og:title" content="', '"')), + "description": text.unescape(extr( + '"og:description" content="', '"')).replace(" ", " "), "num" : text.parse_int(self.post_id), }, "blog": { @@ -62,10 +65,13 @@ class NaverPostExtractor(NaverBase, GalleryExtractor): return data def images(self, page): - return [ - (url.replace("://post", "://blog", 1).partition("?")[0], None) - for url in text.extract_iter(page, 'data-lazy-src="', '"') - ] + results = [] + for url in text.extract_iter(page, 'data-lazy-src="', '"'): + url = url.replace("://post", "://blog", 1).partition("?")[0] + if "\ufffd" in text.unquote(url): + url = text.unquote(url, encoding="EUC-KR") + results.append((url, None)) + return results class NaverBlogExtractor(NaverBase, Extractor): @@ -73,7 +79,8 @@ class NaverBlogExtractor(NaverBase, Extractor): subcategory = "blog" categorytransfer = True pattern = (r"(?:https?://)?blog\.naver\.com/" - r"(?:PostList.nhn\?(?:[^&#]+&)*blogId=([^&#]+)|(\w+)/?$)") + r"(?:PostList\.n(?:aver|hn)\?(?:[^&#]+&)*blogId=([^&#]+)|" + r"(\w+)/?$)") example = "https://blog.naver.com/BLOGID" def __init__(self, match): @@ -81,12 +88,11 @@ class NaverBlogExtractor(NaverBase, Extractor): self.blog_id = match.group(1) or match.group(2) def items(self): - # fetch first post number url = "{}/PostList.nhn?blogId={}".format(self.root, self.blog_id) - post_num = text.extract( + post_num = text.extr( self.request(url).text, 'gnFirstLogNo = "', '"', - )[0] + ) # setup params for API calls url = "{}/PostViewBottomTitleListAsync.nhn".format(self.root) diff --git a/gallery_dl/extractor/oauth.py b/gallery_dl/extractor/oauth.py index 8c8a5a99..55715757 100644 --- a/gallery_dl/extractor/oauth.py +++ b/gallery_dl/extractor/oauth.py @@ -110,7 +110,7 @@ class OAuthBase(Extractor): # get a request token params = {"oauth_callback": self.redirect_uri} - data = self.session.get(request_token_url, params=params).text + data = self.request(request_token_url, params=params).text data = text.parse_query(data) self.session.auth.token_secret = data["oauth_token_secret"] @@ -120,7 +120,7 @@ class OAuthBase(Extractor): data = self.open(authorize_url, params) # exchange the request token for an access token - data = 
self.session.get(access_token_url, params=data).text + data = self.request(access_token_url, params=data).text data = text.parse_query(data) token = data["oauth_token"] token_secret = data["oauth_token_secret"] @@ -189,7 +189,8 @@ class OAuthBase(Extractor): data["client_id"] = client_id data["client_secret"] = client_secret - data = self.session.post(token_url, data=data, auth=auth).json() + data = self.request( + token_url, method="POST", data=data, auth=auth).json() # check token response if "error" in data: @@ -386,7 +387,7 @@ class OAuthMastodon(OAuthBase): "redirect_uris": self.redirect_uri, "scopes": "read", } - data = self.session.post(url, data=data).json() + data = self.request(url, method="POST", data=data).json() if "client_id" not in data or "client_secret" not in data: raise exception.StopExtraction( @@ -441,7 +442,8 @@ class OAuthPixiv(OAuthBase): "redirect_uri" : "https://app-api.pixiv.net" "/web/v1/users/auth/pixiv/callback", } - data = self.session.post(url, headers=headers, data=data).json() + data = self.request( + url, method="POST", headers=headers, data=data).json() if "error" in data: stdout_write("\n{}\n".format(data)) diff --git a/gallery_dl/extractor/pixiv.py b/gallery_dl/extractor/pixiv.py index b9821f23..d732894a 100644 --- a/gallery_dl/extractor/pixiv.py +++ b/gallery_dl/extractor/pixiv.py @@ -104,8 +104,9 @@ class PixivExtractor(Extractor): elif work["page_count"] == 1: url = meta_single_page["original_image_url"] if url == url_sanity: - self.log.debug("Skipping 'sanity_level' warning (%s)", - work["id"]) + self.log.warning( + "Unable to download work %s ('sanity_level' warning)", + work["id"]) continue work["date_url"] = self._date_from_url(url) yield Message.Url, url, text.nameext_from_url(url, work) @@ -619,6 +620,7 @@ class PixivNovelExtractor(PixivExtractor): meta_user = self.config("metadata") meta_bookmark = self.config("metadata-bookmark") embeds = self.config("embeds") + covers = self.config("covers") if embeds: headers = { @@ -650,7 +652,7 @@ class PixivNovelExtractor(PixivExtractor): yield Message.Directory, novel try: - content = self.api.novel_text(novel["id"])["novel_text"] + content = self.api.novel_webview(novel["id"])["text"] except Exception: self.log.warning("Unable to download novel %s", novel["id"]) continue @@ -658,12 +660,25 @@ class PixivNovelExtractor(PixivExtractor): novel["extension"] = "txt" yield Message.Url, "text:" + content, novel + if covers: + path = novel["image_urls"]["large"].partition("/img/")[2] + url = ("https://i.pximg.net/novel-cover-original/img/" + + path.rpartition(".")[0].replace("_master1200", "")) + novel["date_url"] = self._date_from_url(url) + novel["num"] += 1 + novel["suffix"] = "_p{:02}".format(novel["num"]) + novel["_fallback"] = (url + ".png",) + url_jpg = url + ".jpg" + text.nameext_from_url(url_jpg, novel) + yield Message.Url, url_jpg, novel + del novel["_fallback"] + if embeds: desktop = False illusts = {} for marker in text.extract_iter(content, "[", "]"): - if marker.startswith("[jumpuri:If you would like to "): + if marker.startswith("uploadedimage:"): desktop = True elif marker.startswith("pixivimage:"): illusts[marker[11:].partition("-")[0]] = None @@ -918,6 +933,15 @@ class PixivAppAPI(): params = {"novel_id": novel_id} return self._call("/v1/novel/text", params) + def novel_webview(self, novel_id): + params = {"id": novel_id, "viewer_version": "20221031_ai"} + return self._call( + "/webview/v2/novel", params, self._novel_webview_parse) + + def _novel_webview_parse(self, response): + return 
util.json_loads(text.extr( + response.text, "novel: ", ",\n")) + def search_illust(self, word, sort=None, target=None, duration=None, date_start=None, date_end=None): params = {"word": word, "search_target": target, @@ -962,13 +986,17 @@ class PixivAppAPI(): params = {"illust_id": illust_id} return self._call("/v1/ugoira/metadata", params)["ugoira_metadata"] - def _call(self, endpoint, params=None): + def _call(self, endpoint, params=None, parse=None): url = "https://app-api.pixiv.net" + endpoint while True: self.login() response = self.extractor.request(url, params=params, fatal=False) - data = response.json() + + if parse: + data = parse(response) + else: + data = response.json() if "error" not in data: return data diff --git a/gallery_dl/extractor/poipiku.py b/gallery_dl/extractor/poipiku.py index f42016fc..5cc964a3 100644 --- a/gallery_dl/extractor/poipiku.py +++ b/gallery_dl/extractor/poipiku.py @@ -23,6 +23,10 @@ class PoipikuExtractor(Extractor): archive_fmt = "{post_id}_{num}" request_interval = (0.5, 1.5) + def _init(self): + self.cookies.set( + "POIPIKU_CONTENTS_VIEW_MODE", "1", domain="poipiku.com") + def items(self): password = self.config("password", "") diff --git a/gallery_dl/extractor/pornhub.py b/gallery_dl/extractor/pornhub.py index 7ff40a37..c7283fcd 100644 --- a/gallery_dl/extractor/pornhub.py +++ b/gallery_dl/extractor/pornhub.py @@ -143,6 +143,9 @@ class PornhubGifExtractor(PornhubExtractor): "url" : extr('"contentUrl": "', '"'), "date" : text.parse_datetime( extr('"uploadDate": "', '"'), "%Y-%m-%d"), + "viewkey" : extr('From this video: ' + '
', '<'), "user" : text.remove_html(extr("Created by:", "")), } diff --git a/gallery_dl/extractor/readcomiconline.py b/gallery_dl/extractor/readcomiconline.py index 35698605..115de9a2 100644 --- a/gallery_dl/extractor/readcomiconline.py +++ b/gallery_dl/extractor/readcomiconline.py @@ -35,10 +35,7 @@ class ReadcomiconlineBase(): self.log.warning( "Redirect to \n%s\nVisit this URL in your browser, solve " "the CAPTCHA, and press ENTER to continue", response.url) - try: - input() - except (EOFError, OSError): - pass + self.input() else: raise exception.StopExtraction( "Redirect to \n%s\nVisit this URL in your browser and " diff --git a/gallery_dl/extractor/reddit.py b/gallery_dl/extractor/reddit.py index 2ef0f9fb..ce602f6c 100644 --- a/gallery_dl/extractor/reddit.py +++ b/gallery_dl/extractor/reddit.py @@ -74,8 +74,8 @@ class RedditExtractor(Extractor): yield Message.Url, url, submission elif "gallery_data" in media: - for submission["num"], url in enumerate( - self._extract_gallery(media), 1): + for url in self._extract_gallery(media): + submission["num"] += 1 text.nameext_from_url(url, submission) yield Message.Url, url, submission @@ -99,7 +99,10 @@ class RedditExtractor(Extractor): urls.append((url, submission)) for comment in comments: html = comment["body_html"] or "" - if ' href="' in html: + href = (' href="' in html) + media = ("media_metadata" in comment) + + if media or href: comment["date"] = text.parse_timestamp( comment["created_utc"]) if submission: @@ -107,6 +110,14 @@ class RedditExtractor(Extractor): data["comment"] = comment else: data = comment + + if media: + for embed in self._extract_embed(comment): + submission["num"] += 1 + text.nameext_from_url(embed, submission) + yield Message.Url, embed, submission + + if href: for url in text.extract_iter(html, ' href="', '"'): urls.append((url, data)) @@ -118,6 +129,7 @@ class RedditExtractor(Extractor): if url.startswith(( "https://www.reddit.com/message/compose", "https://reddit.com/message/compose", + "https://preview.redd.it/", )): continue @@ -172,6 +184,27 @@ class RedditExtractor(Extractor): submission["id"], item["media_id"]) self.log.debug(src) + def _extract_embed(self, submission): + meta = submission["media_metadata"] + if not meta: + return + + for mid, data in meta.items(): + if data["status"] != "valid" or "s" not in data: + self.log.warning( + "embed %s: skipping item %s (status: %s)", + submission["id"], mid, data.get("status")) + continue + src = data["s"] + url = src.get("u") or src.get("gif") or src.get("mp4") + if url: + yield url.partition("?")[0].replace("/preview.", "/i.", 1) + else: + self.log.error( + "embed %s: unable to fetch download URL for item %s", + submission["id"], mid) + self.log.debug(src) + def _extract_video_ytdl(self, submission): return "https://www.reddit.com" + submission["permalink"] @@ -191,6 +224,8 @@ class RedditExtractor(Extractor): try: if "reddit_video_preview" in post["preview"]: video = post["preview"]["reddit_video_preview"] + if "fallback_url" in video: + yield video["fallback_url"] if "dash_url" in video: yield "ytdl:" + video["dash_url"] if "hls_url" in video: @@ -200,6 +235,12 @@ class RedditExtractor(Extractor): try: for image in post["preview"]["images"]: + variants = image.get("variants") + if variants: + if "gif" in variants: + yield variants["gif"]["source"]["url"] + if "mp4" in variants: + yield variants["mp4"]["source"]["url"] yield image["source"]["url"] except Exception as exc: self.log.debug("%s: %s", exc.__class__.__name__, exc) @@ -446,14 +487,14 @@ class 
RedditAPI(): remaining = response.headers.get("x-ratelimit-remaining") if remaining and float(remaining) < 2: - if self._warn_429: - self._warn_429 = False + self.log.warning("API rate limit exceeded") + if self._warn_429 and self.client_id == self.CLIENT_ID: self.log.info( "Register your own OAuth application and use its " "credentials to prevent this error: " - "https://github.com/mikf/gallery-dl/blob/master" - "/docs/configuration.rst" - "#extractorredditclient-id--user-agent") + "https://gdl-org.github.io/docs/configuration.html" + "#extractor-reddit-client-id-user-agent") + self._warn_429 = False self.extractor.wait( seconds=response.headers["x-ratelimit-reset"]) continue diff --git a/gallery_dl/extractor/skeb.py b/gallery_dl/extractor/skeb.py index b9b4b3c4..38a2d166 100644 --- a/gallery_dl/extractor/skeb.py +++ b/gallery_dl/extractor/skeb.py @@ -26,10 +26,10 @@ class SkebExtractor(Extractor): def _init(self): self.thumbnails = self.config("thumbnails", False) self.article = self.config("article", False) - self.headers = { - "Accept" : "application/json, text/plain, */*", - "Authorization": "Bearer null", - } + self.headers = {"Accept": "application/json, text/plain, */*"} + + if "Authorization" not in self.session.headers: + self.headers["Authorization"] = "Bearer null" def request(self, url, **kwargs): while True: @@ -55,6 +55,12 @@ class SkebExtractor(Extractor): url = file["file_url"] yield Message.Url, url, text.nameext_from_url(url, post) + def _items_users(self): + base = self.root + "/@" + for user in self.users(): + user["_extractor"] = SkebUserExtractor + yield Message.Queue, base + user["screen_name"], user + def posts(self): """Return post number""" @@ -83,6 +89,20 @@ class SkebExtractor(Extractor): return params["offset"] += 30 + def _pagination_users(self, endpoint, params): + url = "{}/api{}".format(self.root, endpoint) + params["offset"] = 0 + params["limit"] = 90 + + while True: + data = self.request( + url, params=params, headers=self.headers).json() + yield from data + + if len(data) < params["limit"]: + return + params["offset"] += params["limit"] + def _get_post_data(self, user_name, post_num): url = "{}/api/users/{}/works/{}".format( self.root, user_name, post_num) @@ -256,22 +276,23 @@ class SkebFollowingExtractor(SkebExtractor): pattern = r"(?:https?://)?skeb\.jp/@([^/?#]+)/following_creators" example = "https://skeb.jp/@USER/following_creators" - def items(self): - for user in self.users(): - url = "{}/@{}".format(self.root, user["screen_name"]) - user["_extractor"] = SkebUserExtractor - yield Message.Queue, url, user + items = SkebExtractor._items_users def users(self): - url = "{}/api/users/{}/following_creators".format( - self.root, self.user_name) - params = {"sort": "date", "offset": 0, "limit": 90} + endpoint = "/users/{}/following_creators".format(self.user_name) + params = {"sort": "date"} + return self._pagination_users(endpoint, params) - while True: - data = self.request( - url, params=params, headers=self.headers).json() - yield from data - if len(data) < params["limit"]: - return - params["offset"] += params["limit"] +class SkebFollowingUsersExtractor(SkebExtractor): + """Extractor for your followed users""" + subcategory = "following-users" + pattern = r"(?:https?://)?skeb\.jp/following_users()" + example = "https://skeb.jp/following_users" + + items = SkebExtractor._items_users + + def users(self): + endpoint = "/following_users" + params = {} + return self._pagination_users(endpoint, params) diff --git a/gallery_dl/extractor/steamgriddb.py 
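Note: the two Skeb follow-list extractors now share one generic helper, `_pagination_users()`, which pages through `/api/...` endpoints 90 entries at a time and stops at the first short page. A standalone sketch of that loop, assuming a plain `requests.Session` in place of gallery-dl's request wrapper:

    import requests

    def paginate_users(endpoint, params=None, headers=None,
                       root="https://skeb.jp"):
        url = "{}/api{}".format(root, endpoint)
        params = dict(params or {})
        params["offset"] = 0
        params["limit"] = 90
        session = requests.Session()
        while True:
            data = session.get(url, params=params, headers=headers).json()
            yield from data
            if len(data) < params["limit"]:
                return        # a short page means no further results
            params["offset"] += params["limit"]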
b/gallery_dl/extractor/steamgriddb.py index 9d46fd6b..85828247 100644 --- a/gallery_dl/extractor/steamgriddb.py +++ b/gallery_dl/extractor/steamgriddb.py @@ -163,6 +163,9 @@ class SteamgriddbAssetExtractor(SteamgriddbExtractor): def assets(self): endpoint = "/api/public/asset/" + self.asset_type + "/" + self.asset_id asset = self._call(endpoint)["asset"] + if asset is None: + raise exception.NotFoundError("asset ({}:{})".format( + self.asset_type, self.asset_id)) return (asset,) diff --git a/gallery_dl/extractor/subscribestar.py b/gallery_dl/extractor/subscribestar.py index 31fb891a..d4adfed9 100644 --- a/gallery_dl/extractor/subscribestar.py +++ b/gallery_dl/extractor/subscribestar.py @@ -175,7 +175,7 @@ class SubscribestarPostExtractor(SubscribestarExtractor): "author_id" : text.parse_int(extr('data-user-id="', '"')), "author_nick": text.unescape(extr('alt="', '"')), "date" : self._parse_datetime(extr( - 'class="section-subtitle">', '<')), + '', '<')), "content" : (extr( '
")[2]), diff --git a/gallery_dl/extractor/tapas.py b/gallery_dl/extractor/tapas.py index 0a9df20c..167953d2 100644 --- a/gallery_dl/extractor/tapas.py +++ b/gallery_dl/extractor/tapas.py @@ -151,3 +151,18 @@ class TapasEpisodeExtractor(TapasExtractor): def episode_ids(self): return (self.episode_id,) + + +class TapasCreatorExtractor(TapasExtractor): + subcategory = "creator" + pattern = BASE_PATTERN + r"/(?!series|episode)([^/?#]+)" + example = "https://tapas.io/CREATOR" + + def items(self): + url = "{}/{}/series".format(self.root, self.groups[0]) + page = self.request(url).text + page = text.extr(page, '