# gallery-dl/gallery_dl/extractor/imagehosts.py

# -*- coding: utf-8 -*-

# Copyright 2016 Mike Fährmann
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2 as
# published by the Free Software Foundation.

"""Collection of extractors for various imagehosts"""

from .common import Extractor, Message
from .. import text
from os.path import splitext
from urllib.parse import urljoin

class ImagehostImageExtractor(Extractor):
    """Base class for single-image extractors for various imagehosts"""
    subcategory = "image"
    directory_fmt = ["{category}"]
    filename_fmt = "{filename}"

    # Knobs for subclasses; all of them are read by __init__() / items():
    #   https   - prefix the page URL with "https://" and rewrite "http:"
    #             image URLs to "https:"
    #   method  - HTTP method used for the page request
    #   params  - "simple" or "complex" selects a predefined form payload;
    #             any other value means an empty payload and a GET request
    #   cookies - handed to the page request unchanged
    https = False
    method = "post"
    params = "simple"
    cookies = None

    def __init__(self, match):
        """Derive page URL, token and request payload from 'match'

        Group 1 of 'match' is appended to the scheme to form the page URL,
        group 2 is stored as the image token.
        """
        Extractor.__init__(self)
        scheme = "https://" if self.https else "http://"
        self.url = scheme + match.group(1)
        self.token = match.group(2)

        # replace the class-level preset name with the actual payload dict
        preset = self.params
        if preset == "simple":
            payload = {"imgContinue": "Continue+to+image+...+"}
        elif preset == "complex":
            payload = {
                "op": "view",
                "id": self.token,
                "pre": "1",
                "adb": "1",
                "next": "Continue+to+image+...+",
            }
        else:
            # no known preset: nothing to submit, so fetch the page via GET
            payload = {}
            self.method = "get"
        self.params = payload

    def items(self):
        """Request the image page and yield the messages for its image"""
        response = self.request(
            self.url, method=self.method, data=self.params,
            cookies=self.cookies,
        )
        image_url, name_source = self.get_info(response.text)
        metadata = text.nameext_from_url(name_source, {"token": self.token})
        if self.https and image_url.startswith("http:"):
            # keep everything after the scheme, swap the scheme itself
            image_url = "https:" + image_url[len("http:"):]
        yield Message.Version, 1
        yield Message.Directory, metadata
        yield Message.Url, image_url, metadata

    def get_info(self, page):
        """Find image-url and string to get filename from"""
        # placeholder values; the subclasses in this module override this
        return ("url", "filename")


class ImgytImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from img.yt"""
    category = "imgyt"
    https = True
    pattern = [
        r"(?:https?://)?((?:www\.)?img\.yt/img-([a-z0-9]+)\.html)",
    ]
    test = [(
        "https://img.yt/img-57a2050547b97.html",
        {
            "url": "6801fac1ff8335bd27a1665ad27ad64cace2cd84",
            "keyword": "7548cc9915f90f5d7ffbafa079085457ae34562c",
            "content": "54592f2635674c25677c6872db3709d343cdf92f",
        },
    )]

    def get_info(self, page):
        """Return the image URL and its alt-text plus the URL's extension"""
        image_url, offset = text.extract(
            page, "<img class='centred' src='", "'")
        # the alt attribute is searched for after the src attribute
        alt_text = text.extract(page, " alt='", "'", offset)[0]
        extension = splitext(image_url)[1]
        return image_url, alt_text + extension

class RapidimgImageExtractor(ImgytImageExtractor):
    """Extractor for single images from rapidimg.net"""
    # get_info() is inherited from ImgytImageExtractor unchanged
    category = "rapidimg"
    https = False
    pattern = [
        r"(?:https?://)?((?:www\.)?rapidimg\.net/img-([a-z0-9]+)\.html)",
    ]
    # overrides the img.yt test list inherited from the parent class
    test = []


class ChronosImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from chronos.to"""
    category = "chronos"
    https = False
    params = "complex"
    pattern = [
        r"(?:https?://)?((?:www\.)?chronos\.to/([a-z0-9]{12}))",
    ]
    test = [(
        "http://chronos.to/bdrmq7rw7v4y",
        {
            "url": "7fcb3fe315c94283644d25ef47a644c2dc8da944",
            "keyword": "04dbc71a1154728d01c931308184050d61c5da55",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL and the alt-text that follows it"""
        image_url, offset = text.extract(page, '<br><img src="', '"')
        alt_text = text.extract(page, ' alt="', '"', offset)[0]
        return image_url, alt_text

class CoreimgImageExtractor(ChronosImageExtractor):
    """Extractor for single images from coreimg.net"""
    # request settings and get_info() come from ChronosImageExtractor
    category = "coreimg"
    pattern = [
        r"(?:https?://)?((?:www\.)?coreimg\.net/([a-z0-9]{12}))",
    ]
    test = [(
        "http://coreimg.net/ykcl5al8uzvg",
        {
            "url": "2b32596a2ea66b7cc784e20f3749f75f20998d78",
            "keyword": "8d71e5b820bc7177baee33ca529c91ae4521299f",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

class ImgmaidImageExtractor(ChronosImageExtractor):
    """Extractor for single images from imgmaid.net"""
    # get_info() comes from ChronosImageExtractor; only https differs
    category = "imgmaid"
    https = True
    pattern = [
        r"(?:https?://)?((?:www\.)?imgmaid\.net/([a-z0-9]{12}))",
    ]
    # overrides the chronos.to test list inherited from the parent class
    test = []

class PicmaniacImageExtractor(ChronosImageExtractor):
    """Extractor for single images from pic-maniac.com"""
    # request settings and get_info() come from ChronosImageExtractor
    category = "picmaniac"
    pattern = [
        r"(?:https?://)?((?:www\.)?pic-maniac\.com/([a-z0-9]{12}))",
    ]
    # overrides the chronos.to test list inherited from the parent class
    test = []


class HosturimageImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from hosturimage.com"""
    category = "hosturimage"
    https = True
    pattern = [
        r"(?:https?://)?((?:www\.)?hosturimage\.com/"
        r"img-([a-z0-9]+)\.html)",
    ]
    test = [(
        "https://hosturimage.com/img-581ca97112bf8.html",
        {
            "url": "c672a3fd7fd48e5506d020aa19c4ac91ba078671",
            "keyword": "c3c94340b8e395e07b5145cf17534b5871ec8593",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL, which also serves as filename source"""
        # only the position reported for the marker is of interest here;
        # the src attribute is then searched for starting at that position
        start = text.extract(page, '<div id="image_details">', '')[1]
        image_url = text.extract(page, "src='", "'", start)[0]
        return image_url, image_url

class ImageontimeImageExtractor(HosturimageImageExtractor):
    """Extractor for single images from imageontime.org"""
    # get_info() is inherited from HosturimageImageExtractor unchanged
    category = "imageontime"
    https = False
    pattern = [
        r"(?:https?://)?((?:www\.)?imageontime\.org/"
        r"img-([a-z0-9]+)\.html)",
    ]
    # overrides the hosturimage.com test list inherited from the parent
    test = []

class Img4everImageExtractor(HosturimageImageExtractor):
    """Extractor for single images from img4ever.net"""
    # get_info() is inherited from HosturimageImageExtractor unchanged
    category = "img4ever"
    https = True
    pattern = [
        r"(?:https?://)?((?:www\.)?img4ever\.net/"
        r"img-([a-z0-9]+)\.html)",
    ]
    # overrides the hosturimage.com test list inherited from the parent
    test = []

class ImguploadImageExtractor(HosturimageImageExtractor):
    """Extractor for single images from imgupload.yt"""
    # get_info() is inherited from HosturimageImageExtractor unchanged
    category = "imgupload"
    https = True
    pattern = [
        r"(?:https?://)?((?:www\.)?imgupload\.yt/"
        r"img-([a-z0-9]+)\.html)",
    ]
    # overrides the hosturimage.com test list inherited from the parent
    test = []


class ImgspotImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imgspot.org"""
    category = "imgspot"
    https = False
    pattern = [
        r"(?:https?://)?((?:www\.)?imgspot\.org/img-([a-z0-9]+)\.html)",
    ]

    def get_info(self, page):
        """Return the image URL, which also serves as filename source"""
        image_url, _ = text.extract(
            page, "<img class='centred_resized' src='", "'")
        return image_url, image_url

class ImgtrialImageExtractor(ImgspotImageExtractor):
    """Extractor for single images from imgtrial.com"""
    # request settings and get_info() come from ImgspotImageExtractor
    category = "imgtrial"
    pattern = [
        r"(?:https?://)?((?:www\.)?imgtrial\.com/img-([a-z0-9]+)\.html)",
    ]


class ImagevenueImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imagevenue.com"""
    category = "imagevenue"
    # None is neither "simple" nor "complex": the page is fetched via GET
    params = None
    pattern = [
        r"(?:https?://)?(img\d+\.imagevenue\.com/"
        r"img\.php\?image=(\d+)_.+)",
    ]

    def get_info(self, page):
        """Return the image URL, which also serves as filename source"""
        src, _ = text.extract(page, 'SRC="', '"')
        # resolve the extracted value against the page URL
        image_url = urljoin(self.url, src)
        return image_url, image_url


class ImagetwistImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imagetwist.com"""
    category = "imagetwist"
    # None is neither "simple" nor "complex": the page is fetched via GET
    params = None
    pattern = [
        r"(?:https?://)?((?:www\.)?imagetwist\.com/([a-z0-9]{12}))",
    ]
    test = [(
        "http://imagetwist.com/4e46hv31tu0q/test.jpg",
        {
            "url": "6b3fc0bd1105b698d2d5844658ca674d66b1e2e7",
            "keyword": "30dd34dcb06b5b51c6cfff199c610b24edb7b9bc",
            "content": "96b1fd099b06faad5879fce23a7e4eb8290d8810",
        },
    )]

    def get_info(self, page):
        """Return the image URL and the alt-text that follows it"""
        image_url, offset = text.extract(page, 'center;"><img src="', '"')
        alt_text = text.extract(page, ' alt="', '"', offset)[0]
        return image_url, alt_text


class ImgcandyImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imgcandy.net"""
    category = "imgcandy"
    pattern = [
        r"(?:https?://)?((?:www\.)?imgcandy\.net/img-([a-z0-9]+)"
        r"(?:_.+)?\.html)",
    ]
    test = [(
        "http://imgcandy.net/img-57d02527efee8_test-テスト.png.html",
        {
            "url": "bc3c9207b10dbfe8e65ccef5b9e3194a7427b4fa",
            "keyword": "1ed1587ef38a6b26ce28b35857a78417239d197a",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL and a filename source

        The filename source is taken from the page URL when that URL
        contains an underscore; otherwise the image URL is used.
        """
        image_url, _ = text.extract(page, "<img class='centred' src='", "'")
        underscore = self.url.find("_")
        if underscore == -1:
            return image_url, image_url
        # text between the first "_" and the last five characters
        # (".html" for URLs accepted by 'pattern')
        return image_url, self.url[underscore + 1:-5]


class ImgclickImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imgclick.net"""
    category = "imgclick"
    params = "complex"
    pattern = [
        r"(?:https?://)?((?:www\.)?imgclick\.net/([^/]+))",
    ]
    test = [(
        "http://imgclick.net/4tbrre1oxew9/test-_-_.png.html",
        {
            "url": "140dcb250a325f2d26b2d918c18b8ac6a2a0f6ab",
            "keyword": "c086439336eea3bdf773d761c32b0edb29af0ebd",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL and the alt-text that follows it"""
        image_url, offset = text.extract(page, '<img  src="', '"')
        alt_text = text.extract(page, 'alt="', '"', offset)[0]
        return image_url, alt_text


class ImgspiceImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imgspice.com"""
    category = "imgspice"
    https = True
    # None is neither "simple" nor "complex": the page is fetched via GET
    params = None
    pattern = [
        r"(?:https?://)?((?:www\.)?imgspice\.com/([^/]+))",
    ]
    test = [(
        "https://imgspice.com/zop38mvvq29u/",
        {
            "url": "a45833733c02b64d105363ffd8fd19f06992a2f7",
            "keyword": "5218f63195e6a487c0881fd1cda78c535c61b462",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL and the unescaped name from the page"""
        raw_name, offset = text.extract(page, '<td nowrap>', '</td>')
        # the search marker swallows the URL's "http://img" prefix ...
        url_tail = text.extract(
            page, '<img src="http://img', '"', offset)[0]
        # ... so it has to be put back in front of the extracted rest
        image_url = "http://img" + url_tail
        return image_url, text.unescape(raw_name)


class ImgtrexImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from imgtrex.com"""
    category = "imgtrex"
    # None is neither "simple" nor "complex": the page is fetched via GET
    params = None
    pattern = [
        r"(?:https?://)?((?:www\.)?imgtrex\.com/([^/]+))",
    ]
    test = [(
        "http://imgtrex.com/im0ypxq0rke4/test-テスト-&<a>.png",
        {
            "url": "c000618bddda42bd599a590b7972c7396d19d8fe",
            "keyword": "58905795a9cd3f17d5ff024fc4d63645795ba23c",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL and the name found in the page title"""
        title, offset = text.extract(page, '<title>ImgTrex: ', '</title>')
        # the image tag is searched for after the title element
        image_url = text.extract(page, '<br>\n<img src="', '"', offset)[0]
        return image_url, title


class PixhostImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from pixhost.org"""
    category = "pixhost"
    # None is neither "simple" nor "complex": the page is fetched via GET
    params = None
    cookies = {"pixhostads": "1", "pixhosttest": "1"}
    pattern = [
        r"(?:https?://)?((?:www\.)?pixhost\.org/show/"
        r"\d+/(\d+)_[^/]+)",
    ]

    def get_info(self, page):
        """Return the image URL and a filename source

        The filename source is the heading text after its first
        underscore; without an underscore the image URL is used instead.
        """
        heading, offset = text.extract(
            page, '<div id="text">\n<h2>', '</h2>')
        image_url = text.extract(
            page, '<img id="show_image" src="', '"', offset)[0]
        underscore = heading.find("_")
        if underscore == -1:
            return image_url, image_url
        return image_url, heading[underscore + 1:]


class TurboimagehostImageExtractor(ImagehostImageExtractor):
    """Extractor for single images from turboimagehost.com"""
    category = "turboimagehost"
    # None is neither "simple" nor "complex": the page is fetched via GET
    params = None
    pattern = [
        r"(?:https?://)?((?:www\.)?turboimagehost\.com/p/(\d+)"
        r"/[^/]+\.html)",
    ]
    test = [(
        "http://www.turboimagehost.com/p/29690902/test--.png.html",
        {
            "url": "c624dc7784de515342117a2678fee6ecf1032d79",
            "keyword": "a4527f14675e4512ef317ee0401940c711fbe012",
            "content": "0c8768055e4e20e7c7259608b67799171b691140",
        },
    )]

    def get_info(self, page):
        """Return the image URL, which also serves as filename source"""
        image_url, _ = text.extract(
            page, '<a href="http://www.turboimagehost.com"><img src="', '"')
        return image_url, image_url