1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 18:53:21 +01:00

[twitter] implement 'cursor' support (#5753)

This commit is contained in:
Mike Fährmann 2024-07-04 23:44:52 +02:00
parent 162756b684
commit 97a50a23d2
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 103 additions and 29 deletions

View File

@ -22,7 +22,10 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
# on release commits, run only for tag event # on release commits, run only for tag event
if: ${{ ! startsWith( github.event.head_commit.message , 'release version ' ) || startsWith( github.ref , 'refs/tags/v' ) }} if: |
github.repository == 'mikf/gallery-dl' &&
( ! startsWith( github.event.head_commit.message , 'release version ' ) ||
startsWith( github.ref , 'refs/tags/v' ) )
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v4

View File

@ -14,6 +14,7 @@ env:
jobs: jobs:
build: build:
if: github.repository == 'mikf/gallery-dl'
runs-on: ${{ matrix.os }} runs-on: ${{ matrix.os }}
defaults: defaults:
run: run:

View File

@ -20,6 +20,7 @@ concurrency:
jobs: jobs:
dispatch: dispatch:
if: github.repository == 'mikf/gallery-dl'
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:

View File

@ -51,6 +51,8 @@ class TwitterExtractor(Extractor):
if not self.config("transform", True): if not self.config("transform", True):
self._transform_user = util.identity self._transform_user = util.identity
self._transform_tweet = util.identity self._transform_tweet = util.identity
self._cursor = None
self._user = None self._user = None
self._user_obj = None self._user_obj = None
self._user_cache = {} self._user_cache = {}
@ -501,6 +503,14 @@ class TwitterExtractor(Extractor):
}, },
} }
def _init_cursor(self):
return self.config("cursor") or None
def _update_cursor(self, cursor):
self.log.debug("Cursor: %s", cursor)
self._cursor = cursor
return cursor
def metadata(self): def metadata(self):
"""Return general metadata""" """Return general metadata"""
return {} return {}
@ -508,6 +518,11 @@ class TwitterExtractor(Extractor):
def tweets(self): def tweets(self):
"""Yield all relevant tweet objects""" """Yield all relevant tweet objects"""
def finalize(self):
    """Tell the user how to resume if a cursor position is still stored"""
    cursor = self._cursor
    if not cursor:
        # nothing stored: no resume hint to print
        return
    self.log.info(
        "Use '-o cursor=%s' to continue downloading "
        "from the current position", cursor)
def login(self): def login(self):
if self.cookies_check(self.cookies_names): if self.cookies_check(self.cookies_names):
return return
@ -539,6 +554,9 @@ class TwitterUserExtractor(TwitterExtractor):
def initialize(self): def initialize(self):
pass pass
def finalize(self):
    """Do nothing; this override deliberately has an empty body"""
    return None
def items(self): def items(self):
base = "{}/{}/".format(self.root, self.user) base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors(( return self._dispatch_extractors((
@ -558,30 +576,76 @@ class TwitterTimelineExtractor(TwitterExtractor):
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)" pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
example = "https://x.com/USER/timeline" example = "https://x.com/USER/timeline"
def _init_cursor(self):
if self._cursor:
return self._cursor.partition("/")[2] or None
return None
def _update_cursor(self, cursor):
if cursor:
self._cursor = self._cursor_prefix + cursor
self.log.debug("Cursor: %s", self._cursor)
else:
self._cursor = None
return cursor
def tweets(self):
    """Yield timeline tweets in up to three stages, resumable via 'cursor'

    The 'cursor' option has the form '<stage>[_<tweet_id>]/<rest>':
    the part before the first '/' selects the stage to resume from
    (and, for stages 2 and 3, the tweet ID used as 'max_id' in the
    search query).

    Stages:
      1. tweets from the source chosen by '_select_tweet_source()'
      2. search results restricted with 'filter:links'
         (skipped when 'self.textonly' is set)
      3. unfiltered search results
         (only reached when stage 2 is skipped or yields nothing)
    """
    self._cursor = cursor = self.config("cursor") or None
    reset = False

    if cursor:
        # split '<stage>_<tweet_id>' off the front of the cursor
        state = cursor.partition("/")[0]
        state, _, tweet_id = state.partition("_")
        state = text.parse_int(state, 1)
    else:
        state = 1

    if state <= 1:
        self._cursor_prefix = "1/"

        # yield initial batch of (media) tweets
        tweet = None
        for tweet in self._select_tweet_source()(self.user):
            yield tweet
        if tweet is None:
            # without a last tweet there is no 'max_id' to search from;
            # this also covers resuming from a cursor that yields nothing,
            # which used to fail on 'tweet["rest_id"]'
            return

        user = self._user["name"]
        tweet_id = tweet["rest_id"]
        state = reset = 2
    else:
        user = self.user

    # build search query
    query = "from:{} max_id:{}".format(user, tweet_id)
    if self.retweets:
        query += " include:retweets include:nativeretweets"

    if state <= 2:
        self._cursor_prefix = "2_{}/".format(tweet_id)
        if reset:
            # entering this stage fresh: store only the stage prefix
            self._cursor = self._cursor_prefix

        if not self.textonly:
            # try to search for media-only tweets
            # (every result is yielded; a stray 'break' here used to
            #  end the whole timeline after the first search result)
            tweet = None
            for tweet in self.api.search_timeline(query + " filter:links"):
                yield tweet
            if tweet is not None:
                return self._update_cursor(None)

        state = reset = 3

    if state <= 3:
        # yield unfiltered search results
        self._cursor_prefix = "3_{}/".format(tweet_id)
        if reset:
            self._cursor = self._cursor_prefix

        yield from self.api.search_timeline(query)
        return self._update_cursor(None)
def _select_tweet_source(self): def _select_tweet_source(self):
strategy = self.config("strategy") strategy = self.config("strategy")
@ -1415,7 +1479,9 @@ class TwitterAPI():
"%s %s (%s)", response.status_code, response.reason, errors) "%s %s (%s)", response.status_code, response.reason, errors)
def _pagination_legacy(self, endpoint, params): def _pagination_legacy(self, endpoint, params):
original_retweets = (self.extractor.retweets == "original") extr = self.extractor
params["cursor"] = extr._init_cursor()
original_retweets = (extr.retweets == "original")
bottom = ("cursor-bottom-", "sq-cursor-bottom") bottom = ("cursor-bottom-", "sq-cursor-bottom")
while True: while True:
@ -1423,7 +1489,7 @@ class TwitterAPI():
instructions = data["timeline"]["instructions"] instructions = data["timeline"]["instructions"]
if not instructions: if not instructions:
return return extr._update_cursor(None)
tweets = data["globalObjects"]["tweets"] tweets = data["globalObjects"]["tweets"]
users = data["globalObjects"]["users"] users = data["globalObjects"]["users"]
@ -1504,8 +1570,8 @@ class TwitterAPI():
# stop on empty response # stop on empty response
if not cursor or (not tweets and not tweet_id): if not cursor or (not tweets and not tweet_id):
return return extr._update_cursor(None)
params["cursor"] = cursor params["cursor"] = extr._update_cursor(cursor)
def _pagination_tweets(self, endpoint, variables, def _pagination_tweets(self, endpoint, variables,
path=None, stop_tweets=True, features=None): path=None, stop_tweets=True, features=None):
@ -1514,6 +1580,7 @@ class TwitterAPI():
pinned_tweet = extr.pinned pinned_tweet = extr.pinned
params = {"variables": None} params = {"variables": None}
variables["cursor"] = extr._init_cursor()
if features is None: if features is None:
features = self.features_pagination features = self.features_pagination
if features: if features:
@ -1550,7 +1617,7 @@ class TwitterAPI():
cursor = entry["content"]["value"] cursor = entry["content"]["value"]
if entries is None: if entries is None:
if not cursor: if not cursor:
return return extr._update_cursor(None)
entries = () entries = ()
except LookupError: except LookupError:
@ -1699,12 +1766,14 @@ class TwitterAPI():
continue continue
if stop_tweets and not tweet: if stop_tweets and not tweet:
return return extr._update_cursor(None)
if not cursor or cursor == variables.get("cursor"): if not cursor or cursor == variables.get("cursor"):
return return extr._update_cursor(None)
variables["cursor"] = cursor variables["cursor"] = extr._update_cursor(cursor)
def _pagination_users(self, endpoint, variables, path=None): def _pagination_users(self, endpoint, variables, path=None):
extr = self.extractor
variables["cursor"] = extr._init_cursor()
params = { params = {
"variables": None, "variables": None,
"features" : self._json_dumps(self.features_pagination), "features" : self._json_dumps(self.features_pagination),
@ -1724,7 +1793,7 @@ class TwitterAPI():
data = data[key] data = data[key]
instructions = data["instructions"] instructions = data["instructions"]
except KeyError: except KeyError:
return return extr._update_cursor(None)
for instr in instructions: for instr in instructions:
if instr["type"] == "TimelineAddEntries": if instr["type"] == "TimelineAddEntries":
@ -1742,8 +1811,8 @@ class TwitterAPI():
cursor = entry["content"]["value"] cursor = entry["content"]["value"]
if not cursor or cursor.startswith(("-1|", "0|")) or not entry: if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
return return extr._update_cursor(None)
variables["cursor"] = cursor variables["cursor"] = extr._update_cursor(cursor)
def _handle_ratelimit(self, response): def _handle_ratelimit(self, response):
rl = self.extractor.config("ratelimit") rl = self.extractor.config("ratelimit")