automatically detect and bypass Cloudflare challenge pages

TODO: cache and re-apply cf_clearance cookies
2024-11-22 18:53:21 +01:00 · 2019-03-10 15:31:33 +01:00 · 2019-03-10 15:31:33 +01:00 · 6dae6bee37
commit 6dae6bee37
parent 25aaf55514
5 changed files with 55 additions and 57 deletions
--- a/gallery_dl/cloudflare.py
+++ b/gallery_dl/cloudflare.py
@ -1,6 +1,6 @@
 # -*- coding: utf-8 -*-

-# Copyright 2015-2018 Mike Fährmann
+# Copyright 2015-2019 Mike Fährmann
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License version 2 as
@ -13,46 +13,52 @@ import time
 import operator
 import urllib.parse
 from . import text
-from .cache import cache


-def request_func(self, *args, **kwargs):
-    cookies = _cookiecache(self.root)
-    if cookies:
-        self.session.cookies.update(cookies)
-    response = self.session.get(*args, **kwargs)
-    if response.status_code == 503:
-        _cookiecache.invalidate(self.root)
-        self.log.info("Solving Cloudflare challenge")
-        response = solve_challenge(self.session, response)
-        _cookiecache(self.root, self.session.cookies)
-    return response
+def is_challenge(response):
+    return (response.status_code == 503 and
+            response.headers.get("Server", "").startswith("cloudflare") and
+            b"jschl-answer" in response.content)


-def solve_challenge(session, response):
+def solve_challenge(session, response, kwargs):
+    """Solve Cloudflare challenge and get cf_clearance cookie"""
+    parsed = urllib.parse.urlsplit(response.url)
+    root = parsed.scheme + "://" + parsed.netloc
+
+    cf_kwargs = kwargs.copy()
+    headers = cf_kwargs["headers"] = (
+        kwargs["headers"].copy() if "headers" in kwargs else {})
+    params = cf_kwargs["params"] = (
+        kwargs["params"].copy() if "params" in kwargs else {})

-    session.headers["Referer"] = response.url
    page = response.text
-    params = text.extract_all(page, (
-        ('jschl_vc', 'name="jschl_vc" value="', '"'),
-        ('pass'    , 'name="pass" value="', '"'),
-    ))[0]
-    params["jschl_answer"] = solve_jschl(response.url, page)
+    params["pass"] = text.extract(page, 'name="pass" value="', '"')[0]
+    params["jschl_vc"] = text.extract(page, 'name="jschl_vc" value="', '"')[0]
+    params["jschl_answer"] = solve_js_challenge(page, parsed.netloc)
+    headers["Referer"] = response.url

    time.sleep(4)
-    url = text.urljoin(response.url, "/cdn-cgi/l/chk_jschl")
-    return session.get(url, params=params)
+
+    url = root + "/cdn-cgi/l/chk_jschl"
+    cf_kwargs["allow_redirects"] = False
+    cf_response = session.request(response.request.method, url, **cf_kwargs)
+
+    location = cf_response.headers["Location"]
+    if location[0] == "/":
+        location = root + location
+    return location


-def solve_jschl(url, page):
-    """Solve challenge to get 'jschl_answer' value"""
+def solve_js_challenge(page, netloc):
+    """Evaluate JS challenge in 'page' to get 'jschl_answer' value"""

    # build variable name
    # e.g. '...f, wqnVscP={"DERKbJk":+(...' --> wqnVscP.DERKbJk
    data, pos = text.extract_all(page, (
        ('var' , ',f, ', '='),
-        ('key' , '"', '"'),
-        ('expr', ':', '}'),
+        ('key' , '"'   , '"'),
+        ('expr', ':'   , '}'),
    ))
    variable = "{}.{}".format(data["var"], data["key"])
    vlength = len(variable)
@ -67,19 +73,19 @@ def solve_jschl(url, page):
    for expr in expressions.split(";")[1:]:

        if expr.startswith(variable):
-            # select arithmetc function based on operator (+, -, *)
-            func = operator_functions[expr[vlength]]
+            # select arithmetic function based on operator (+/-/*)
+            func = OPERATORS[expr[vlength]]
            # evaluate the rest of the expression
            value = evaluate_expression(expr[vlength+2:])
-            # combine the expression value with our current solution
+            # combine expression value with our current solution
            solution = func(solution, value)

        elif expr.startswith("a.value"):
-            # add length of the hostname, i.e. add 11 for 'example.org'
-            solution += len(urllib.parse.urlsplit(url).netloc)
+            # add length of hostname
+            solution += len(netloc)

            if ".toFixed(" in expr:
-                # trim the solution to 10 decimal places
+                # trim solution to 10 decimal places
                # and strip trailing zeros
                solution = "{:.10f}".format(solution).rstrip("0")

@ -87,7 +93,7 @@ def solve_jschl(url, page):


 def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
-    """Evaluate a Javascript expression for the challenge"""
+    """Evaluate a single Javascript expression for the challenge"""

    if "/" in expr:
        # split the expression in numerator and denominator subexpressions,
@ -102,26 +108,21 @@ def evaluate_expression(expr, split_re=re.compile(r"\(+([^)]*)\)")):
    result = ""
    for subexpr in split_re.findall(expr):
        result += str(sum(
-            expression_values[part]
+            VALUES[part]
            for part in subexpr.split("[]")
        ))
    return int(result)


-operator_functions = {
+OPERATORS = {
    "+": operator.add,
    "-": operator.sub,
    "*": operator.mul,
 }

-expression_values = {
+VALUES = {
    "": 0,
    "+": 0,
    "!+": 1,
    "+!!": 1,
 }
-
-
-@cache(maxage=365*24*60*60, keyarg=0)
-def _cookiecache(key, item=None):
-    return item
--- a/gallery_dl/extractor/common.py
+++ b/gallery_dl/extractor/common.py
@ -18,7 +18,7 @@ import requests
 import threading
 import http.cookiejar
 from .message import Message
-from .. import config, text, exception
+from .. import config, text, exception, cloudflare


 class Extractor():
@ -86,6 +86,10 @@ class Extractor():
                    if encoding:
                        response.encoding = encoding
                    return response
+                if cloudflare.is_challenge(response):
+                    self.log.info("Solving Cloudflare challenge")
+                    url = cloudflare.solve_challenge(session, response, kwargs)
+                    continue

                msg = "{}: {} for url: {}".format(code, response.reason, url)
                if code < 500 and code != 429:
--- a/gallery_dl/extractor/kissmanga.py
+++ b/gallery_dl/extractor/kissmanga.py
@ -9,17 +9,12 @@
 """Extract manga-chapters and entire manga from https://kissmanga.com/"""

 from .common import ChapterExtractor, MangaExtractor
-from .. import text, cloudflare, aes, exception
+from .. import text, aes, exception
 from ..cache import cache
 import hashlib
 import ast
 import re

-IV = [
-    0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
-    0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3
-]
-

 class KissmangaBase():
    """Base class for kissmanga extractors"""
@ -28,10 +23,10 @@ class KissmangaBase():
    root = "https://kissmanga.com"

    def request(self, url):
-        response = cloudflare.request_func(self, url)
+        response = super().request(url)
        if response.history and "/Message/AreYouHuman?" in response.url:
            self.log.error("Requesting too many pages caused a redirect to %s."
-                           " Try visiting this URL in your browser and solving"
+                           " Try visiting this URL in your browser and solve"
                           " the CAPTCHA to continue.", response.url)
            raise exception.StopExtraction()
        return response
@ -112,8 +107,10 @@ class KissmangaChapterExtractor(KissmangaBase, ChapterExtractor):
        self.session.headers["Referer"] = None
        try:
            key = self.build_aes_key(page)
+            iv = (0xa5, 0xe8, 0xe2, 0xe9, 0xc2, 0x72, 0x1b, 0xe0,
+                  0xa8, 0x4a, 0xd6, 0x60, 0xc4, 0x72, 0xc1, 0xf3)
            return [
-                (aes.aes_cbc_decrypt_text(data, key, IV), None)
+                (aes.aes_cbc_decrypt_text(data, key, iv), None)
                for data in text.extract_iter(
                    page, 'lstImages.push(wrapKA("', '"'
                )
--- a/gallery_dl/extractor/komikcast.py
+++ b/gallery_dl/extractor/komikcast.py
@ -9,7 +9,7 @@
 """Extract manga-chapters and entire manga from https://komikcast.com/"""

 from .common import ChapterExtractor, MangaExtractor
-from .. import text, cloudflare
+from .. import text
 import re


@ -18,8 +18,6 @@ class KomikcastBase():
    category = "komikcast"
    root = "https://komikcast.com"

-    request = cloudflare.request_func
-
    @staticmethod
    def parse_chapter_string(chapter_string, data=None):
        """Parse 'chapter_string' value and add its info to 'data'"""
--- a/gallery_dl/extractor/readcomiconline.py
+++ b/gallery_dl/extractor/readcomiconline.py
@ -9,7 +9,7 @@
 """Extract comic-issues and entire comics from https://readcomiconline.to/"""

 from .common import ChapterExtractor, MangaExtractor
-from .. import text, cloudflare
+from .. import text
 import re


@ -21,8 +21,6 @@ class ReadcomiconlineBase():
    archive_fmt = "{issue_id}_{page}"
    root = "https://readcomiconline.to"

-    request = cloudflare.request_func
-

 class ReadcomiconlineIssueExtractor(ReadcomiconlineBase, ChapterExtractor):
    """Extractor for comic-issues from readcomiconline.to"""