1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 10:42:34 +01:00

[twitter] implement 'cursor' support (#5753)

This commit is contained in:
Mike Fährmann 2024-07-04 23:44:52 +02:00
parent 162756b684
commit 97a50a23d2
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
4 changed files with 103 additions and 29 deletions

View File

@ -22,7 +22,10 @@ jobs:
runs-on: ubuntu-latest
# on release commits, run only for tag event
if: ${{ ! startsWith( github.event.head_commit.message , 'release version ' ) || startsWith( github.ref , 'refs/tags/v' ) }}
if: |
github.repository == 'mikf/gallery-dl' &&
( ! startsWith( github.event.head_commit.message , 'release version ' ) ||
startsWith( github.ref , 'refs/tags/v' ) )
steps:
- uses: actions/checkout@v4

View File

@ -14,6 +14,7 @@ env:
jobs:
build:
if: github.repository == 'mikf/gallery-dl'
runs-on: ${{ matrix.os }}
defaults:
run:

View File

@ -20,6 +20,7 @@ concurrency:
jobs:
dispatch:
if: github.repository == 'mikf/gallery-dl'
runs-on: ubuntu-latest
steps:

View File

@ -51,6 +51,8 @@ class TwitterExtractor(Extractor):
if not self.config("transform", True):
self._transform_user = util.identity
self._transform_tweet = util.identity
self._cursor = None
self._user = None
self._user_obj = None
self._user_cache = {}
@ -501,6 +503,14 @@ class TwitterExtractor(Extractor):
},
}
def _init_cursor(self):
return self.config("cursor") or None
def _update_cursor(self, cursor):
self.log.debug("Cursor: %s", cursor)
self._cursor = cursor
return cursor
def metadata(self):
"""Return general metadata"""
return {}
@ -508,6 +518,11 @@ class TwitterExtractor(Extractor):
def tweets(self):
"""Yield all relevant tweet objects"""
def finalize(self):
if self._cursor:
self.log.info("Use '-o cursor=%s' to continue downloading "
"from the current position", self._cursor)
def login(self):
if self.cookies_check(self.cookies_names):
return
@ -539,6 +554,9 @@ class TwitterUserExtractor(TwitterExtractor):
def initialize(self):
pass
def finalize(self):
pass
def items(self):
base = "{}/{}/".format(self.root, self.user)
return self._dispatch_extractors((
@ -558,30 +576,76 @@ class TwitterTimelineExtractor(TwitterExtractor):
pattern = BASE_PATTERN + r"/(?!search)([^/?#]+)/timeline(?!\w)"
example = "https://x.com/USER/timeline"
def _init_cursor(self):
if self._cursor:
return self._cursor.partition("/")[2] or None
return None
def _update_cursor(self, cursor):
if cursor:
self._cursor = self._cursor_prefix + cursor
self.log.debug("Cursor: %s", self._cursor)
else:
self._cursor = None
return cursor
def tweets(self):
# yield initial batch of (media) tweets
tweet = None
for tweet in self._select_tweet_source()(self.user):
yield tweet
if tweet is None:
return
self._cursor = cursor = self.config("cursor") or None
reset = False
if cursor:
state = cursor.partition("/")[0]
state, _, tweet_id = state.partition("_")
state = text.parse_int(state, 1)
else:
state = 1
if state <= 1:
self._cursor_prefix = "1/"
# yield initial batch of (media) tweets
tweet = None
for tweet in self._select_tweet_source()(self.user):
yield tweet
if tweet is None and not cursor:
return
user = self._user["name"]
tweet_id = tweet["rest_id"]
state = reset = 2
else:
user = self.user
# build search query
query = "from:{} max_id:{}".format(
self._user["name"], tweet["rest_id"])
query = "from:{} max_id:{}".format(user, tweet_id)
if self.retweets:
query += " include:retweets include:nativeretweets"
if not self.textonly:
# try to search for media-only tweets
tweet = None
for tweet in self.api.search_timeline(query + " filter:links"):
yield tweet
if tweet is not None:
return
if state <= 2:
self._cursor_prefix = "2_{}/".format(tweet_id)
if reset:
self._cursor = self._cursor_prefix
# yield unfiltered search results
yield from self.api.search_timeline(query)
if not self.textonly:
# try to search for media-only tweets
tweet = None
for tweet in self.api.search_timeline(query + " filter:links"):
yield tweet
break
if tweet is not None:
return self._update_cursor(None)
state = reset = 3
if state <= 3:
# yield unfiltered search results
self._cursor_prefix = "3_{}/".format(tweet_id)
if reset:
self._cursor = self._cursor_prefix
yield from self.api.search_timeline(query)
return self._update_cursor(None)
def _select_tweet_source(self):
strategy = self.config("strategy")
@ -1415,7 +1479,9 @@ class TwitterAPI():
"%s %s (%s)", response.status_code, response.reason, errors)
def _pagination_legacy(self, endpoint, params):
original_retweets = (self.extractor.retweets == "original")
extr = self.extractor
params["cursor"] = extr._init_cursor()
original_retweets = (extr.retweets == "original")
bottom = ("cursor-bottom-", "sq-cursor-bottom")
while True:
@ -1423,7 +1489,7 @@ class TwitterAPI():
instructions = data["timeline"]["instructions"]
if not instructions:
return
return extr._update_cursor(None)
tweets = data["globalObjects"]["tweets"]
users = data["globalObjects"]["users"]
@ -1504,8 +1570,8 @@ class TwitterAPI():
# stop on empty response
if not cursor or (not tweets and not tweet_id):
return
params["cursor"] = cursor
return extr._update_cursor(None)
params["cursor"] = extr._update_cursor(cursor)
def _pagination_tweets(self, endpoint, variables,
path=None, stop_tweets=True, features=None):
@ -1514,6 +1580,7 @@ class TwitterAPI():
pinned_tweet = extr.pinned
params = {"variables": None}
variables["cursor"] = extr._init_cursor()
if features is None:
features = self.features_pagination
if features:
@ -1550,7 +1617,7 @@ class TwitterAPI():
cursor = entry["content"]["value"]
if entries is None:
if not cursor:
return
return extr._update_cursor(None)
entries = ()
except LookupError:
@ -1699,12 +1766,14 @@ class TwitterAPI():
continue
if stop_tweets and not tweet:
return
return extr._update_cursor(None)
if not cursor or cursor == variables.get("cursor"):
return
variables["cursor"] = cursor
return extr._update_cursor(None)
variables["cursor"] = extr._update_cursor(cursor)
def _pagination_users(self, endpoint, variables, path=None):
extr = self.extractor
variables["cursor"] = extr._init_cursor()
params = {
"variables": None,
"features" : self._json_dumps(self.features_pagination),
@ -1724,7 +1793,7 @@ class TwitterAPI():
data = data[key]
instructions = data["instructions"]
except KeyError:
return
return extr._update_cursor(None)
for instr in instructions:
if instr["type"] == "TimelineAddEntries":
@ -1742,8 +1811,8 @@ class TwitterAPI():
cursor = entry["content"]["value"]
if not cursor or cursor.startswith(("-1|", "0|")) or not entry:
return
variables["cursor"] = cursor
return extr._update_cursor(None)
variables["cursor"] = extr._update_cursor(cursor)
def _handle_ratelimit(self, response):
rl = self.extractor.config("ratelimit")