mirror of
https://github.com/mikf/gallery-dl.git
synced 2024-11-22 10:42:34 +01:00
filter duplicate URLs (#36)
Duplicate URLs can occur if, for example, an artist adds another image to their gallery while an extractor is running and images are being downloaded from sites like pixiv/nijie/hentaifoundry. The first image on the next page will then already have been downloaded, which causes a premature end if '--abort-on-skip' is used.
This commit is contained in:
parent
00420ff202
commit
268cfa3cfe
@ -25,17 +25,26 @@ class Job():
|
||||
self.extractor.log.debug(
|
||||
"Using %s for %s", self.extractor.__class__.__name__, url)
|
||||
|
||||
# url predicates
|
||||
predicates = [util.UniquePredicate()]
|
||||
|
||||
items = config.get(("images",))
|
||||
if items:
|
||||
pred = util.RangePredicate(items)
|
||||
if pred.lower > 1:
|
||||
pred.index += self.extractor.skip(pred.lower - 1)
|
||||
self.pred_url = pred
|
||||
else:
|
||||
self.pred_url = True
|
||||
predicates.append(pred)
|
||||
|
||||
self.pred_url = util.build_predicate(predicates)
|
||||
|
||||
# queue predicates
|
||||
predicates = []
|
||||
|
||||
items = config.get(("chapters",))
|
||||
self.pred_queue = util.RangePredicate(items) if items else True
|
||||
if items:
|
||||
predicates.append(util.RangePredicate(items))
|
||||
|
||||
self.pred_queue = util.build_predicate(predicates)
|
||||
|
||||
def run(self):
|
||||
"""Execute or run the job"""
|
||||
@ -73,16 +82,17 @@ class Job():
|
||||
def dispatch(self, msg):
|
||||
"""Call the appropriate message handler"""
|
||||
if msg[0] == Message.Url:
|
||||
if self.pred_url:
|
||||
self.update_kwdict(msg[2])
|
||||
self.handle_url(msg[1], msg[2])
|
||||
_, url, kwds = msg
|
||||
if self.pred_url(url, kwds):
|
||||
self.update_kwdict(kwds)
|
||||
self.handle_url(url, kwds)
|
||||
|
||||
elif msg[0] == Message.Directory:
|
||||
self.update_kwdict(msg[1])
|
||||
self.handle_directory(msg[1])
|
||||
|
||||
elif msg[0] == Message.Queue:
|
||||
if self.pred_queue:
|
||||
if self.pred_queue(msg[1], None):
|
||||
self.handle_queue(msg[1])
|
||||
|
||||
elif msg[0] == Message.Version:
|
||||
|
@ -139,8 +139,17 @@ CODES = {
|
||||
SPECIAL_EXTRACTORS = ("oauth", "recursive", "test")
|
||||
|
||||
|
||||
def build_predicate(predicates):
    """Collapse a list of predicates into a single callable.

    An empty list yields a predicate that accepts everything, a list with
    one entry yields that entry unchanged, and anything longer is wrapped
    in a ChainPredicate.
    """
    if not predicates:
        # nothing to check: accept every (url, kwds) pair
        return lambda url, kwds: True
    if len(predicates) > 1:
        return ChainPredicate(predicates)
    # exactly one predicate: no wrapper needed
    return predicates[0]
|
||||
|
||||
|
||||
class RangePredicate():
|
||||
"""Predicate; is True if the current index is in the given range"""
|
||||
"""Predicate; True if the current index is in the given range"""
|
||||
def __init__(self, rangespec):
|
||||
self.ranges = optimize_range(parse_range(rangespec))
|
||||
self.index = 0
|
||||
@ -149,7 +158,7 @@ class RangePredicate():
|
||||
else:
|
||||
self.lower, self.upper = 0, 0
|
||||
|
||||
def __bool__(self):
|
||||
def __call__(self, url, kwds):
|
||||
self.index += 1
|
||||
|
||||
if self.index > self.upper:
|
||||
@ -161,6 +170,30 @@ class RangePredicate():
|
||||
return False
|
||||
|
||||
|
||||
class UniquePredicate():
    """Predicate; True only the first time a given URL is seen"""

    def __init__(self):
        # every URL this predicate has been called with so far
        self.urls = set()

    def __call__(self, url, kwds):
        """Record 'url' and report whether it was new; 'kwds' is unused"""
        seen = self.urls
        if url in seen:
            return False
        seen.add(url)
        return True
|
||||
|
||||
|
||||
class ChainPredicate():
    """Predicate; True only if every wrapped predicate returns True"""

    def __init__(self, predicates):
        self.predicates = predicates

    def __call__(self, url, kwds):
        """Evaluate the predicates in order, stopping at the first failure"""
        # all() short-circuits, so predicates after the first falsy
        # result are not called
        return all(check(url, kwds) for check in self.predicates)
|
||||
|
||||
|
||||
class PathFormat():
|
||||
|
||||
def __init__(self, extractor):
|
||||
|
Loading…
Reference in New Issue
Block a user