1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2024-11-22 18:53:21 +01:00

filter duplicate URLs (#36)

Duplicate URLs might occur if, for example, an artist adds another
image to his gallery while an extractor is running and images are being
downloaded on sites like pixiv/nijie/hentaifoundry.
The new image shifts the existing ones back by one position, so the
next image on the next page will have already been downloaded and
will cause a premature end if '--abort-on-skip' is being used.
This commit is contained in:
Mike Fährmann 2017-09-06 17:08:50 +02:00
parent 00420ff202
commit 268cfa3cfe
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88
2 changed files with 53 additions and 10 deletions

View File

@ -25,17 +25,26 @@ class Job():
self.extractor.log.debug(
"Using %s for %s", self.extractor.__class__.__name__, url)
# url predicates
predicates = [util.UniquePredicate()]
items = config.get(("images",))
if items:
pred = util.RangePredicate(items)
if pred.lower > 1:
pred.index += self.extractor.skip(pred.lower - 1)
self.pred_url = pred
else:
self.pred_url = True
predicates.append(pred)
self.pred_url = util.build_predicate(predicates)
# queue predicates
predicates = []
items = config.get(("chapters",))
self.pred_queue = util.RangePredicate(items) if items else True
if items:
predicates.append(util.RangePredicate(items))
self.pred_queue = util.build_predicate(predicates)
def run(self):
"""Execute or run the job"""
@ -73,16 +82,17 @@ class Job():
def dispatch(self, msg):
"""Call the appropriate message handler"""
if msg[0] == Message.Url:
if self.pred_url:
self.update_kwdict(msg[2])
self.handle_url(msg[1], msg[2])
_, url, kwds = msg
if self.pred_url(url, kwds):
self.update_kwdict(kwds)
self.handle_url(url, kwds)
elif msg[0] == Message.Directory:
self.update_kwdict(msg[1])
self.handle_directory(msg[1])
elif msg[0] == Message.Queue:
if self.pred_queue:
if self.pred_queue(msg[1], None):
self.handle_queue(msg[1])
elif msg[0] == Message.Version:

View File

@ -139,8 +139,17 @@ CODES = {
SPECIAL_EXTRACTORS = ("oauth", "recursive", "test")
def build_predicate(predicates):
    """Collapse a list of predicates into a single callable

    An empty list yields a predicate that accepts everything, a list
    with one entry yields that entry itself, and a longer list is
    wrapped in a ChainPredicate.
    """
    if predicates:
        if len(predicates) == 1:
            # nothing to combine; hand back the sole predicate unchanged
            return predicates[0]
        return ChainPredicate(predicates)
    # no restrictions configured: accept every (url, kwds) pair
    return lambda url, kwds: True
class RangePredicate():
"""Predicate; is True if the current index is in the given range"""
"""Predicate; True if the current index is in the given range"""
def __init__(self, rangespec):
self.ranges = optimize_range(parse_range(rangespec))
self.index = 0
@ -149,7 +158,7 @@ class RangePredicate():
else:
self.lower, self.upper = 0, 0
def __bool__(self):
def __call__(self, url, kwds):
self.index += 1
if self.index > self.upper:
@ -161,6 +170,30 @@ class RangePredicate():
return False
class UniquePredicate():
    """Predicate; True if given URL has not been encountered before"""

    def __init__(self):
        # every URL this predicate has been called with so far
        self.urls = set()

    def __call__(self, url, kwds):
        # 'kwds' is accepted for signature compatibility but not inspected
        if url in self.urls:
            return False
        # first sighting: remember the URL and let it pass
        self.urls.add(url)
        return True
class ChainPredicate():
    """Predicate; True if all of its predicates return True"""

    def __init__(self, predicates):
        # callables taking (url, kwds); evaluated in the given order
        self.predicates = predicates

    def __call__(self, url, kwds):
        # all() stops at the first falsy result, so predicates after a
        # rejecting one are not called
        return all(check(url, kwds) for check in self.predicates)
class PathFormat():
def __init__(self, extractor):