1
0
mirror of https://github.com/mikf/gallery-dl.git synced 2025-01-31 11:41:35 +01:00

[shopify] use alternate regex for products on collection pages

when the first one doesn't yield any results
This commit is contained in:
Mike Fährmann 2020-08-15 18:24:14 +02:00
parent 7619152988
commit d06ad148c7
No known key found for this signature in database
GPG Key ID: 5680CA389D365A88

View File

@ -74,21 +74,33 @@ class ShopifyCollectionExtractor(ShopifyExtractor):
def products(self):
params = text.parse_query(self.params)
params["page"] = text.parse_int(params.get("page"), 1)
search_re = re.compile(r"/collections/[\w-]+/products/[\w-]+")
fetch = True
last = None
while True:
page = self.request(self.item_url, params=params).text
urls = search_re.findall(page)
last = None
for pattern in (
r"/collections/[\w-]+/products/[\w-]+",
r"href=[\"'](/products/[\w-]+)",
):
search_re = re.compile(pattern)
if not urls:
return
for path in urls:
if last == path:
continue
last = path
yield self.root + path
params["page"] += 1
while True:
if fetch:
page = self.request(self.item_url, params=params).text
urls = search_re.findall(page)
if len(urls) < 3:
if last:
return
fetch = False
break
fetch = True
for path in urls:
if last == path:
continue
last = path
yield self.root + path
params["page"] += 1
class ShopifyProductExtractor(ShopifyExtractor):
@ -121,7 +133,6 @@ EXTRACTORS = {
("https://www.fashionnova.com/collections/mini-dresses/?page=1"),
("https://www.fashionnova.com/collections/mini-dresses#1"),
),
},
}