gallery-dl/gallery_dl/extractor/imagehosts.py

# -*- coding: utf-8 -*-

# Copyright 2016-2018 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Collection of extractors for various imagehosts"""

from .common import Extractor, Message
from .. import text, exception
from ..cache import memcache
from os.path import splitext
from urllib.parse import urljoin


class ImagehostImageExtractor(Extractor):
    """Base class for single-image extractors for various imagehosts"""
    subcategory = "image"
    archive_fmt = "{token}"
    # Per-site configuration, overridden by subclasses:
    #   https    - use "https://" for the page URL and upgrade "http:"
    #              image URLs to "https:"
    #   method   - HTTP method used to request the image page
    #   params   - "simple" / "complex" select one of the predefined form
    #              payloads built in __init__(); any other value (e.g. None)
    #              sends no data and switches the request method to "get"
    #   cookies  - passed as 'cookies' to the page request
    #   encoding - passed as 'encoding' to the page request
    https = False
    method = "post"
    params = "simple"
    cookies = None
    encoding = None

    def __init__(self, match):
        """Set up page URL, token and request payload from 'match'

        Group 1 of 'match' is the page URL without its scheme and
        group 2 is the token identifying the image.
        """
        Extractor.__init__(self)
        self.url = ("https://" if self.https else "http://") + match.group(1)
        self.token = match.group(2)
        if self.params == "simple":
            self.params = {
                "imgContinue": "Continue+to+image+...+",
            }
        elif self.params == "complex":
            self.params = {
                "op": "view",
                "id": self.token,
                "pre": "1",
                "adb": "1",
                "next": "Continue+to+image+...+",
            }
        else:
            self.params = {}
            self.method = "get"

    def items(self):
        """Request the image page and yield the messages for its image

        Raises exception.NotFoundError if get_info() does not provide
        an image URL.
        """
        page = self.request(
            self.url,
            method=self.method,
            data=self.params,
            cookies=self.cookies,
            encoding=self.encoding,
        ).text

        url, filename = self.get_info(page)
        if not url:
            # get_info() implementations that do not validate their result
            # hand back a missing URL; without this check the lookup
            # failure would surface as an AttributeError on
            # 'url.startswith' below instead of a proper "not found".
            raise exception.NotFoundError("image")
        data = text.nameext_from_url(filename, {"token": self.token})
        if self.https and url.startswith("http:"):
            # replace the 5-character "http:" prefix
            url = "https:" + url[5:]

        yield Message.Version, 1
        yield Message.Directory, data
        yield Message.Url, url, data

    def get_info(self, page):
        """Find image-url and string to get filename from

        Subclasses override this and return an '(url, filename)' tuple,
        which items() unpacks; this base version has no body and
        therefore returns None.
        """


class ImxtoImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imx.to"""
    category = "imxto"
    pattern = [r"(?:https?://)?(?:www\.)?(imx\.to/i/(\w+))",
               r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)"
               r"/img-([a-z0-9]+)\.html)"]
    test = [
        ("https://imx.to/i/1qdeva", {  # new-style URL
            "url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130",
            "keyword": "7bb48a2327561ae04ea7a6d4e18e715379e2f497",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        }),
        ("https://imx.to/img-57a2050547b97.html", {  # old-style URL
            "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
            "keyword": "451ad3d4745489c2e663acb1281d89c36ada940a",
            "content": "54592f2635674c25677c6872db3709d343cdf92f",
        }),
        ("https://img.yt/img-57a2050547b97.html", {  # img.yt domain
            "url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",
        }),
        ("https://imx.to/img-57a2050547b98.html", {
            "exception": exception.NotFoundError,
        }),
    ]
    https = True
    encoding = "utf-8"

    def __init__(self, match):
        """Normalize old-style URLs and remember which style was given"""
        ImagehostImageExtractor.__init__(self, match)
        # 'urlext' is True for old-style "/img-<id>.html" URLs; for those,
        # get_info() appends the image URL's extension to the page title
        self.urlext = "/img-" in self.url
        if self.urlext:
            # old-style links may still point at the img.yt domain
            self.url = self.url.replace("img.yt", "imx.to")

    def get_info(self, page):
        """Return the image URL and the string to derive a filename from

        Raises exception.NotFoundError if the page has no image link.
        """
        image_url, offset = text.extract(
            page, '<div style="text-align:center;"><a href="', '"')
        if not image_url:
            raise exception.NotFoundError("image")

        title = text.extract(page, ' title="', '"', offset)[0]
        if not title:
            # no usable title: derive the filename from the URL itself
            return image_url, image_url
        if self.urlext:
            title += splitext(image_url)[1]
        return image_url, title


class AcidimgImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from acidimg.cc"""
    category = "acidimg"
    pattern = [r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)"]
    test = [("https://acidimg.cc/img-5acb6b9de4640.html", {
        "url": "f132a630006e8d84f52d59555191ed82b3b64c04",
        "keyword": "183098c59d9244650f666b6cb4df96d76d2aeae8",
        "content": "0c8768055e4e20e7c7259608b67799171b691140",
    })]
    https = True
    encoding = "utf-8"

    def get_info(self, page):
        """Return the image URL and the string to derive a filename from

        Raises exception.NotFoundError if the page has no centred image.
        """
        image_url, offset = text.extract(
            page, "<img class='centred' src='", "'")
        if not image_url:
            raise exception.NotFoundError("image")

        alt_text, _ = text.extract(page, " alt='", "'", offset)
        if alt_text:
            # the alt text gets the extension of the image URL appended
            return image_url, alt_text + splitext(image_url)[1]
        return image_url, image_url


class ImagevenueImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imagevenue.com"""
    category = "imagevenue"
    pattern = [(r"(?:https?://)?(img\d+\.imagevenue\.com/"
                r"img\.php\?image=(\d+)_[^&#]+)")]
    params = None

    def get_info(self, page):
        """Return the absolute image URL, used for the filename as well

        Raises exception.NotFoundError if the page contains no 'SRC="'
        attribute to take the image location from.
        """
        path = text.extract(page, 'SRC="', '"')[0]
        if not path:
            # urljoin() hands back its base for an empty/missing URL, which
            # would silently report the HTML page itself as the image
            raise exception.NotFoundError("image")
        # the extracted value may be relative; resolve it against the page
        url = urljoin(self.url, path)
        return url, url


class ImagetwistImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imagetwist.com"""
    category = "imagetwist"
    pattern = [r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))"]
    test = [("http://imagetwist.com/4e46hv31tu0q/test.jpg", {
        "url": "c999dc1a5dec0525ac9eb8c092f173dfe6dba0b0",
        "keyword": "30dd34dcb06b5b51c6cfff199c610b24edb7b9bc",
        "content": "96b1fd099b06faad5879fce23a7e4eb8290d8810",
    })]
    https = True
    params = None

    @property
    @memcache(maxage=3 * 60 * 60)
    def cookies(self):
        """Cookies obtained by requesting the image page once

        The result goes through 'memcache' (maxage presumably given in
        seconds, i.e. three hours -- confirm against the cache module).
        """
        response = self.request(self.url)
        return response.cookies

    def get_info(self, page):
        """Return the image URL and its 'alt' text as filename source"""
        image_url, offset = text.extract(page, 'center;"><img src="', '"')
        alt_text = text.extract(page, ' alt="', '"', offset)[0]
        return image_url, alt_text


class ImgspiceImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imgspice.com"""
    category = "imgspice"
    pattern = [r"(?:https?://)?((?:www\.)?imgspice\.com/([^/]+))"]
    test = [("https://imgspice.com/zop38mvvq29u/", {
        "url": "a45833733c02b64d105363ffd8fd19f06992a2f7",
    })]
    https = True
    params = None

    def get_info(self, page):
        """Return the image URL and the string to derive a filename from

        Raises exception.NotFoundError if the page has no image tag whose
        source starts with "https://img".
        """
        filename, pos = text.extract(page, '<td nowrap>', '</td>')
        url     , pos = text.extract(page, '<img src="https://img', '"', pos)
        if not url:
            # previously a missing image tag ended in a TypeError from
            # concatenating "https://img" with None
            raise exception.NotFoundError("image")
        # the search prefix is not part of the extracted value; re-add it
        url = "https://img" + url
        if not filename:
            # no name cell on the page: derive the filename from the URL
            # rather than passing a missing value to text.unescape()
            return url, url
        return url, text.unescape(filename)


class PixhostImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from pixhost.org"""
    category = "pixhost"
    pattern = [(r"(?:https?://)?((?:www\.)?pixhost\.org/show/"
                r"\d+/(\d+)_[^/]+)")]
    https = True
    params = None
    # cookies sent along with the page request
    cookies = {"pixhostads": "1", "pixhosttest": "1"}

    def get_info(self, page):
        """Return the image URL and its 'alt' text as filename source"""
        image_url, offset = text.extract(
            page, 'class="image-img" src="', '"')
        alt_text = text.extract(page, 'alt="', '"', offset)[0]
        return image_url, alt_text


class PostimgImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from postimg.org"""
    category = "postimg"
    pattern = [(r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.org/"
                r"image/([^/]+)/?)")]
    https = True
    params = None

    def get_info(self, page):
        """Return the full-size image URL, used for the filename as well

        Raises exception.NotFoundError if the page has no 'data-full'
        attribute.
        """
        location = text.extract(page, 'data-full="', '"')[0]
        if not location:
            # previously a missing attribute ended in a TypeError from
            # concatenating "https:" with None
            raise exception.NotFoundError("image")
        # the attribute value is prefixed with the scheme to form the URL
        url = "https:" + location
        return url, url


class TurboimagehostImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from turboimagehost.com"""
    category = "turboimagehost"
    pattern = [(r"(?:https?://)?((?:www\.)?turboimagehost\.com/p/(\d+)"
                r"/[^/]+\.html)")]
    test = [("https://www.turboimagehost.com/p/29690902/test--.png.html", {
        "url": "ada27a4e04f9ffd5ab7cd787f4559d5b3744520b",
        "keyword": "a4527f14675e4512ef317ee0401940c711fbe012",
        "content": "0c8768055e4e20e7c7259608b67799171b691140",
    })]
    https = True
    params = None

    def get_info(self, page):
        """Return the image URL, used for the filename as well"""
        # the image tag directly follows a link ending in the site's address
        image_url, _ = text.extract(
            page, '://www.turboimagehost.com"><img src="', '"')
        return image_url, image_url
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`# -*- coding: utf-8 -*-`

set 'archive_fmt' values These are going to be used to create an unique id for each image. 2018-01-30 22:49:16 +01:00			`# Copyright 2016-2018 Mike Fährmann`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU General Public License version 2 as`
			`# published by the Free Software Foundation.`

move chronos & co. to imagehosts file 2016-11-03 18:14:33 +01:00			`"""Collection of extractors for various imagehosts"""`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
			`from .common import Extractor, Message`
[imgyt] raise NotFoundError instead of crashing 2017-02-02 15:52:48 +01:00			`from .. import text, exception`
[imagehosts] cleanup removed - chronos.to - unable to resolve hostname - coreimg.net - same - imgmaid.net - same - hosturimage.com - everything returns 404 - imageontime.org - redirects to some shady site - imgupload.yt - cloudflare error 522, host down - img4ever.net - read timeout 2018-02-22 23:50:50 +01:00			`from ..cache import memcache`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`from os.path import splitext`
[imagevenue] add extractor 2016-11-28 22:30:00 +01:00			`from urllib.parse import urljoin`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`class ImagehostImageExtractor(Extractor):`
move chronos & co. to imagehosts file 2016-11-03 18:14:33 +01:00			`"""Base class for single-image extractors for various imagehosts"""`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`subcategory = "image"`
set 'archive_fmt' values These are going to be used to create an unique id for each image. 2018-01-30 22:49:16 +01:00			`archive_fmt = "{token}"`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`https = False`
			`method = "post"`
			`params = "simple"`
[pixhost] add extractor 2016-11-09 12:03:14 +01:00			`cookies = None`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`encoding = None`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
			`def __init__(self, match):`
			`Extractor.__init__(self)`
			`self.url = ("https://" if self.https else "http://") + match.group(1)`
			`self.token = match.group(2)`
			`if self.params == "simple":`
			`self.params = {`
			`"imgContinue": "Continue+to+image+...+",`
			`}`
			`elif self.params == "complex":`
			`self.params = {`
			`"op": "view",`
			`"id": self.token,`
			`"pre": "1",`
			`"adb": "1",`
			`"next": "Continue+to+image+...+",`
			`}`
			`else:`
			`self.params = {}`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`self.method = "get"`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
			`def items(self):`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`page = self.request(`
			`self.url,`
			`method=self.method,`
			`data=self.params,`
			`cookies=self.cookies,`
			`encoding=self.encoding,`
			`).text`

combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`url, filename = self.get_info(page)`
			`data = text.nameext_from_url(filename, {"token": self.token})`
			`if self.https and url.startswith("http:"):`
			`url = "https:" + url[5:]`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`yield Message.Version, 1`
			`yield Message.Directory, data`
			`yield Message.Url, url, data`

move chronos & co. to imagehosts file 2016-11-03 18:14:33 +01:00			`def get_info(self, page):`
			`"""Find image-url and string to get filename from"""`

combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`class ImxtoImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from imx.to"""`
			`category = "imxto"`
			`pattern = [r"(?:https?://)?(?:www\.)?(imx\.to/i/(\w+))",`
			`r"(?:https?://)?(?:www\.)?((?:imx\.to|img\.yt)"`
			`r"/img-([a-z0-9]+)\.html)"]`
[imgyt] raise NotFoundError instead of crashing 2017-02-02 15:52:48 +01:00			`test = [`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`("https://imx.to/i/1qdeva", { # new-style URL`
			`"url": "ab2173088a6cdef631d7a47dec4a5da1c6a00130",`
			`"keyword": "7bb48a2327561ae04ea7a6d4e18e715379e2f497",`
			`"content": "0c8768055e4e20e7c7259608b67799171b691140",`
			`}),`
			`("https://imx.to/img-57a2050547b97.html", { # old-style URL`
			`"url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",`
			`"keyword": "451ad3d4745489c2e663acb1281d89c36ada940a",`
[imgyt] raise NotFoundError instead of crashing 2017-02-02 15:52:48 +01:00			`"content": "54592f2635674c25677c6872db3709d343cdf92f",`
			`}),`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`("https://img.yt/img-57a2050547b97.html", { # img.yt domain`
			`"url": "a83fe6ef1909a318c4d49fcf2caf62f36c3f9204",`
			`}),`
			`("https://imx.to/img-57a2050547b98.html", {`
[imgyt] raise NotFoundError instead of crashing 2017-02-02 15:52:48 +01:00			`"exception": exception.NotFoundError,`
			`}),`
			`]`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`https = True`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`encoding = "utf-8"`

			`def __init__(self, match):`
			`ImagehostImageExtractor.__init__(self, match)`
			`if "/img-" in self.url:`
			`self.url = self.url.replace("img.yt", "imx.to")`
			`self.urlext = True`
			`else:`
			`self.urlext = False`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
			`def get_info(self, page):`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`url, pos = text.extract(`
			`page, '<div style="text-align:center;"><a href="', '"')`
[imgyt] raise NotFoundError instead of crashing 2017-02-02 15:52:48 +01:00			`if not url:`
			`raise exception.NotFoundError("image")`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`filename, pos = text.extract(page, ' title="', '"', pos)`
			`if self.urlext and filename:`
			`filename += splitext(url)[1]`
			`return url, filename or url`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00
code adjustments according to pep8 nr2 2017-02-01 00:53:19 +01:00
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`class AcidimgImageExtractor(ImagehostImageExtractor):`
[acidimg] add image extractor 2017-09-09 15:17:51 +02:00			`"""Extractor for single images from acidimg.cc"""`
			`category = "acidimg"`
			`pattern = [r"(?:https?://)?((?:www\.)?acidimg\.cc/img-([a-z0-9]+)\.html)"]`
replace 'imgyt' with 'imxto' https://img.yt/ wasn't available for a couple of days, but has now re-emerged as https://imx.to/ with a new web-interface. Links to older images still work (see tests). 2018-04-09 15:53:20 +02:00			`test = [("https://acidimg.cc/img-5acb6b9de4640.html", {`
			`"url": "f132a630006e8d84f52d59555191ed82b3b64c04",`
			`"keyword": "183098c59d9244650f666b6cb4df96d76d2aeae8",`
			`"content": "0c8768055e4e20e7c7259608b67799171b691140",`
			`})]`
			`https = True`
			`encoding = "utf-8"`

			`def get_info(self, page):`
			`url, pos = text.extract(page, "<img class='centred' src='", "'")`
			`if not url:`
			`raise exception.NotFoundError("image")`
			`filename, pos = text.extract(page, " alt='", "'", pos)`
			`return url, (filename + splitext(url)[1]) if filename else url`
[acidimg] add image extractor 2017-09-09 15:17:51 +02:00

[imagevenue] add extractor 2016-11-28 22:30:00 +01:00			`class ImagevenueImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from imagevenue.com"""`
			`category = "imagevenue"`
			`pattern = [(r"(?:https?://)?(img\d+\.imagevenue\.com/"`
fix/improve some regular expressions 2017-10-09 22:37:50 +02:00			`r"img\.php\?image=(\d+)_[^&#]+)")]`
[imagevenue] add extractor 2016-11-28 22:30:00 +01:00			`params = None`

			`def get_info(self, page):`
			`url = text.extract(page, 'SRC="', '"')[0]`
			`url = urljoin(self.url, url)`
			`return url, url`


move another 4 extractors 2016-11-04 09:33:38 +01:00			`class ImagetwistImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from imagetwist.com"""`
			`category = "imagetwist"`
			`pattern = [r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))"]`
			`test = [("http://imagetwist.com/4e46hv31tu0q/test.jpg", {`
[imagetwist] use https 2017-06-24 16:21:00 +02:00			`"url": "c999dc1a5dec0525ac9eb8c092f173dfe6dba0b0",`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`"keyword": "30dd34dcb06b5b51c6cfff199c610b24edb7b9bc",`
			`"content": "96b1fd099b06faad5879fce23a7e4eb8290d8810",`
			`})]`
[imagetwist] use https 2017-06-24 16:21:00 +02:00			`https = True`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`params = None`

[imagetwist] fix site access 2017-02-08 22:59:00 +01:00			`@property`
[imagehosts] cleanup removed - chronos.to - unable to resolve hostname - coreimg.net - same - imgmaid.net - same - hosturimage.com - everything returns 404 - imageontime.org - redirects to some shady site - imgupload.yt - cloudflare error 522, host down - img4ever.net - read timeout 2018-02-22 23:50:50 +01:00			`@memcache(maxage=3*60*60)`
[imagetwist] fix site access 2017-02-08 22:59:00 +01:00			`def cookies(self):`
			`return self.request(self.url).cookies`

combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`def get_info(self, page):`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`url , pos = text.extract(page, 'center;"><img src="', '"')`
			`filename, pos = text.extract(page, ' alt="', '"', pos)`
combine imagehost extractors into a single file added extractors for - hosturimage.com - imageontime.org - imgupload.yt - imgspice.com - imgclick.net All of these would have shared a lot of common code, so i created a base class for imagehost extractors and put them all in the same file to avoid clutter. 2016-11-03 15:46:04 +01:00			`return url, filename`

move another 4 extractors 2016-11-04 09:33:38 +01:00
			`class ImgspiceImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from imgspice.com"""`
			`category = "imgspice"`
			`pattern = [r"(?:https?://)?((?:www\.)?imgspice\.com/([^/]+))"]`
add tests 2016-11-05 13:51:52 +01:00			`test = [("https://imgspice.com/zop38mvvq29u/", {`
			`"url": "a45833733c02b64d105363ffd8fd19f06992a2f7",`
			`})]`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`https = True`
			`params = None`

			`def get_info(self, page):`
			`filename, pos = text.extract(page, '<td nowrap>', '</td>')`
[imgspice] fix extraction 2017-09-26 17:08:02 +02:00			`url , pos = text.extract(page, '<img src="https://img', '"', pos)`
[imagehosts] remove even more dead sites All removed sites either - reject all incoming connections or - display a message from their domain registrar 2018-03-12 21:25:13 +01:00			`return "https://img" + url, text.unescape(filename)`
[imgtrex] re-add extractor 2017-03-21 15:47:51 +01:00

[pixhost] add extractor 2016-11-09 12:03:14 +01:00			`class PixhostImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from pixhost.org"""`
			`category = "pixhost"`
			`pattern = [(r"(?:https?://)?((?:www\.)?pixhost\.org/show/"`
			`r"\d+/(\d+)_[^/]+)")]`
[pixhost] adjust to new site layout 2016-12-06 10:03:33 +01:00			`https = True`
[pixhost] add extractor 2016-11-09 12:03:14 +01:00			`params = None`
			`cookies = {"pixhostads": "1", "pixhosttest": "1"}`

			`def get_info(self, page):`
fixed the module for pixhost 2017-04-21 13:54:10 +02:00			`url , pos = text.extract(page, "class=\"image-img\" src=\"", "\"")`
			`filename, pos = text.extract(page, "alt=\"", "\"", pos)`
[pixhost] adjust to new site layout 2016-12-06 10:03:33 +01:00			`return url, filename`
[pixhost] add extractor 2016-11-09 12:03:14 +01:00

[postimg] add extractor 2016-12-06 12:46:41 +01:00			`class PostimgImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from postimg.org"""`
			`category = "postimg"`
			`pattern = [(r"(?:https?://)?((?:www\.)?(?:postimg|pixxxels)\.org/"`
			`r"image/([^/]+)/?)")]`
			`https = True`
			`params = None`

			`def get_info(self, page):`
			`url = "https:" + text.extract(page, 'data-full="', '"')[0]`
			`return url, url`


move another 4 extractors 2016-11-04 09:33:38 +01:00			`class TurboimagehostImageExtractor(ImagehostImageExtractor):`
			`"""Extractor for single images from turboimagehost.com"""`
			`category = "turboimagehost"`
add tests 2016-11-05 13:51:52 +01:00			`pattern = [(r"(?:https?://)?((?:www\.)?turboimagehost\.com/p/(\d+)"`
			`r"/[^/]+\.html)")]`
[imagehosts] remove even more dead sites All removed sites either - reject all incoming connections or - display a message from their domain registrar 2018-03-12 21:25:13 +01:00			`test = [("https://www.turboimagehost.com/p/29690902/test--.png.html", {`
fix tests for turboimagehost and pinterest 2017-01-27 22:39:44 +01:00			`"url": "ada27a4e04f9ffd5ab7cd787f4559d5b3744520b",`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`"keyword": "a4527f14675e4512ef317ee0401940c711fbe012",`
			`"content": "0c8768055e4e20e7c7259608b67799171b691140",`
			`})]`
[turboimagehost] fix extraction 2018-03-06 14:25:10 +01:00			`https = True`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`params = None`

			`def get_info(self, page):`
[turboimagehost] fix extraction 2018-03-06 14:25:10 +01:00			`needle = '://www.turboimagehost.com"><img src="'`
move another 4 extractors 2016-11-04 09:33:38 +01:00			`url = text.extract(page, needle, '"')[0]`
			`return url, url`