commit 14b5cc238350af631f5302f865ec4b6ec12bf385 Author: Alex Thomassen Date: Sat Oct 29 21:47:05 2022 +0000 Initial commit diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f2d31a7 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# Relative paths are relative to the current working directory (`pwd`), so explicit full paths are recommended. +# See *.sample.* files for test runs and examples. + +# Simply a list of Oda.com product URLs to scrape. One URL per line. +PRODUCT_URL_LIST=./data/products.txt + +# The output file for the scraped data. This file will be overwritten if it already exists. +PRODUCT_OUTPUT_FILE=./data/products.csv + +# Can be either 'json' or 'csv' +PRODUCT_OUTPUT_FORMAT=json + +# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt. +USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36" \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f9eee55 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor +.env \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4d87c6a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Alex Thomassen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/OdaScraper.php b/OdaScraper.php
new file mode 100644
index 0000000..a932496
--- /dev/null
+++ b/OdaScraper.php
@@ -0,0 +1,296 @@
+load();
+}
+
+/**
+ * Scraper for Oda.com.
+ *
+ * Offers two independent ways of collecting product data:
+ * - fetchPopular(): pages through the "popular products" JSON API.
+ * - scrapeFromFile(): downloads the product pages listed in a text file and
+ *   extracts name + price from the HTML.
+ */
+class Oda
+{
+    /**
+     * CookieJar for Guzzle. The same jar is passed with every request so the
+     * cookies set by the initial page load are sent along with API calls.
+     *
+     * @var GuzzleHttp\Cookie\CookieJar
+     */
+    private CookieJar $cookies;
+
+    /**
+     * URL to send the initial request (for cookies).
+     *
+     * @var string
+     */
+    private string $initUrl = 'https://oda.com/no/products/popular/';
+
+    /**
+     * URL for querying the API.
+     *
+     * @var string
+     */
+    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';
+
+    /**
+     * Base HTTP headers
+     *
+     * API requires additional headers:
+     * - Accept: application/json (probably)
+     * - Cookie
+     * - X-CSRFToken
+     *
+     * @var array
+     */
+    private array $headers = [
+        'Origin' => 'https://oda.com',
+        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
+    ];
+
+    /**
+     * CSRF token used for API requests.
+     * Stays null until initialize() has found a `csrftoken` cookie.
+     *
+     * @var string|null
+     */
+    private ?string $csrfToken = null;
+
+    /**
+     * HTTP Client
+     *
+     * @var \GuzzleHttp\Client
+     */
+    private ?HttpClient $client;
+
+    /**
+     * Creates the cookie jar and HTTP client.
+     * A non-empty `USER_AGENT` environment value replaces the default User-Agent header.
+     */
+    public function __construct()
+    {
+        $this->cookies = new CookieJar();
+        $this->client = new HttpClient(['cookies' => true]);
+
+        if (!empty($_ENV['USER_AGENT'])) {
+            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
+        }
+    }
+
+    /**
+     * Does the initial request to get the CSRF token and cookies.
+     *
+     * The token is left as null when the response sets no `csrftoken` cookie.
+     *
+     * @return void
+     */
+    public function initialize() : void
+    {
+        $options = [
+            RequestOptions::HEADERS => $this->headers,
+            RequestOptions::COOKIES => $this->cookies,
+        ];
+
+        $this->client->request('GET', $this->initUrl, $options);
+
+        // The cookie lookup yields null when the cookie is absent, so the call
+        // chain has to be null-safe; `?? null` alone does not guard the method call.
+        $this->csrfToken = $this->cookies->getCookieByName('csrftoken')?->getValue();
+    }
+
+    /**
+     * Helper function for sending requests via the HTTP client.
+     *
+     * When no options are given, the base headers and the shared cookie jar are used.
+     *
+     * @param string $url
+     * @param string $method HTTP method (GET, POST, PUT etc.)
+     * @param array $options Guzzle options
+     *
+     * @return string The raw response body
+     */
+    private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string
+    {
+        if (empty($options)) {
+            $options = [
+                RequestOptions::HEADERS => $this->headers,
+                RequestOptions::COOKIES => $this->cookies,
+            ];
+        }
+
+        $response = $this->client->request($method, $url, $options);
+
+        return $response->getBody()->getContents();
+    }
+
+    /**
+     * Helper function for making products API requests.
+     *
+     * Sends a POST request with the IDs to exclude and returns the decoded JSON response.
+     *
+     * @param array $excludeIds Product IDs that should not be returned again
+     *
+     * @return array
+     *
+     * @throws \JsonException When the response body is not valid JSON
+     * @throws \RuntimeException When the response is valid JSON but not an object/array
+     */
+    public function getPopularProducts($excludeIds = []) : array
+    {
+        $headers = $this->headers;
+        $headers['Accept'] = 'application/json';
+
+        // Only send the CSRF header when a token was actually obtained.
+        if ($this->csrfToken !== null) {
+            $headers['X-CSRFToken'] = $this->csrfToken;
+        }
+
+        $options = [
+            RequestOptions::HEADERS => $headers,
+            RequestOptions::COOKIES => $this->cookies,
+            RequestOptions::JSON => [
+                'exclude_ids' => $excludeIds,
+            ],
+        ];
+
+        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);
+
+        // Fail with a descriptive exception instead of a TypeError on the
+        // return type when the body cannot be decoded into an array.
+        $decoded = json_decode($body, true, 512, JSON_THROW_ON_ERROR);
+
+        if (!is_array($decoded)) {
+            throw new \RuntimeException(
+                sprintf('Unexpected response from `%s`: expected a JSON object.', $this->apiProductsUrl)
+            );
+        }
+
+        return $decoded;
+    }
+
+    /**
+     * Runs the scraper to scrape popular products
+     *
+     * This relies on the Oda API, unlike the product page scraping.
+     * Every page is requested with the IDs of all products seen so far, until
+     * the API reports that there is nothing more. The collected products are
+     * written as JSON to `data/<unix timestamp>.json` next to this script.
+     *
+     * @return void
+     *
+     * @throws \RuntimeException When the output directory or file cannot be written
+     */
+    public function fetchPopular() : void
+    {
+        $this->initialize();
+
+        $excludeIds = [];
+        $products = [];
+        $isFirstPage = true;
+
+        do {
+            $page = $this->getPopularProducts($excludeIds);
+            $results = $page['results'] ?? [];
+
+            foreach ($results as $result) {
+                $products[] = $result;
+            }
+
+            if ($isFirstPage) {
+                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
+                $isFirstPage = false;
+            } else {
+                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
+            }
+
+            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));
+
+            // An empty page cannot extend the exclude list, so continuing would
+            // repeat the identical request forever; stop even if `has_more` is set.
+            $hasMore = !empty($page['has_more']) && !empty($results);
+        } while ($hasMore);
+
+        $directory = __DIR__ . '/data';
+
+        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
+            throw new \RuntimeException(sprintf('Unable to create output directory `%s`.', $directory));
+        }
+
+        $filename = sprintf('%s/%s.json', $directory, time());
+
+        if (file_put_contents($filename, json_encode($products, JSON_THROW_ON_ERROR)) === false) {
+            throw new \RuntimeException(sprintf('Unable to write output file `%s`.', $filename));
+        }
+    }
+
+    /**
+     * Takes an Oda product page URL and scrapes the product data.
+     *
+     * The name is read from the first `h1` element and the price from the
+     * `content` attribute of the first `.price` element.
+     *
+     * NOTE(review): the crawler presumably throws when either element is
+     * missing from the page; confirm before relying on it.
+     *
+     * @param string $url
+     *
+     * @return array Product with `name` (string) and `price` (float)
+     */
+    public function scrapeProductPage(string $url) : array
+    {
+        $body = $this->getRequestBody($url);
+        $crawler = new Crawler($body);
+
+        $name = trim((string) $crawler->filter('h1')->text());
+        $price = (float) $crawler->filter('.price')->attr('content');
+
+        return [
+            'name' => $name,
+            'price' => $price,
+        ];
+    }
+
+    /**
+     * Writes the scraped data to the `PRODUCT_OUTPUT_FILE` file, as JSON or CSV
+     * depending on `PRODUCT_OUTPUT_FORMAT`.
+     *
+     * Nothing is written when there are no products, when either environment
+     * value is missing, or when the format is neither `json` nor `csv`.
+     *
+     * @param array $products
+     *
+     * @return void
+     *
+     * @throws \RuntimeException When the output file cannot be opened or written
+     */
+    private function writeProductsToFile(array $products) : void
+    {
+        if (empty($products)) {
+            return;
+        }
+
+        // `?? ''` keeps unset variables from raising "undefined array key" warnings.
+        $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'] ?? '';
+        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';
+
+        if (empty($outputFormat) || empty($outputFile)) {
+            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
+            return;
+        }
+
+        if ($outputFormat === 'json') {
+            if (file_put_contents($outputFile, json_encode($products, JSON_THROW_ON_ERROR)) === false) {
+                throw new \RuntimeException(sprintf('Unable to write output file `%s`.', $outputFile));
+            }
+
+            return;
+        }
+
+        if ($outputFormat === 'csv') {
+            $handle = fopen($outputFile, 'w');
+
+            if ($handle === false) {
+                throw new \RuntimeException(sprintf('Unable to open output file `%s` for writing.', $outputFile));
+            }
+
+            try {
+                // Add the first row as headers, taken from the first product.
+                $columns = array_keys($products[array_key_first($products)]);
+                fputcsv($handle, $columns);
+
+                foreach ($products as $product) {
+                    // Emit values in header order so every row lines up with
+                    // the header even if a product has its keys in another order.
+                    $row = [];
+
+                    foreach ($columns as $column) {
+                        $row[] = $product[$column] ?? '';
+                    }
+
+                    fputcsv($handle, $row);
+                }
+            } finally {
+                fclose($handle);
+            }
+
+            return;
+        }
+
+        echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
+    }
+
+    /**
+     * Scrapes products from the specified `PRODUCT_URL_LIST` file.
+     * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future.
+     *
+     * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format.
+     *
+     * Blank lines in the URL list are ignored.
+     *
+     * @return array
+     *
+     * @throws \RuntimeException When the URL list cannot be opened
+     */
+    public function scrapeFromFile() : array
+    {
+        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';
+
+        if (empty($filename)) {
+            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
+            return [];
+        }
+
+        $inputHandle = fopen($filename, 'r');
+
+        if ($inputHandle === false) {
+            throw new \RuntimeException(sprintf('Unable to open product URL list `%s`.', $filename));
+        }
+
+        $products = [];
+
+        try {
+            while (($line = fgets($inputHandle)) !== false) {
+                $url = trim($line);
+
+                // A blank line would otherwise be requested as an empty URL.
+                if ($url === '') {
+                    continue;
+                }
+
+                $products[] = $this->scrapeProductPage($url);
+            }
+        } finally {
+            fclose($inputHandle);
+        }
+
+        $this->writeProductsToFile($products);
+        return $products;
+    }
+}
+
+$oda = new Oda();
+$oda->scrapeFromFile();
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..99364f9
--- /dev/null
+++ b/README.md
@@ -0,0 +1,16 @@
+# Scraper for Oda.com
+
+## Requirements
+
+- PHP 8.1 (cli)
+- PHP extension: `php-xml`
+- PHP extension: `php-curl`
+- [`Composer`](https://getcomposer.org/)
+    - Other PHP extensions may also be necessary to install. Composer will likely complain when you run `composer install`.
+
+## Setup
+
+- Copy `.env.example` to `.env` and fill in the values as you see fit.
+- Run `composer install` to install dependencies. +- Run `php OdaScraper.php` to scrape the products. +- Script something to send the output files to wherever you want? \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..9fb76b2 --- /dev/null +++ b/composer.json @@ -0,0 +1,8 @@ +{ + "require": { + "guzzlehttp/guzzle": "^7.5", + "symfony/dom-crawler": "^6.1", + "symfony/css-selector": "^6.1", + "vlucas/phpdotenv": "^5.5" + } +} diff --git a/composer.lock b/composer.lock new file mode 100644 index 0000000..967e551 --- /dev/null +++ b/composer.lock @@ -0,0 +1,1294 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "e596a64ebcd46e365fa53285412a5a5f", + "packages": [ + { + "name": "graham-campbell/result-type", + "version": "v1.1.0", + "source": { + "type": "git", + "url": "https://github.com/GrahamCampbell/Result-Type.git", + "reference": "a878d45c1914464426dc94da61c9e1d36ae262a8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/GrahamCampbell/Result-Type/zipball/a878d45c1914464426dc94da61c9e1d36ae262a8", + "reference": "a878d45c1914464426dc94da61c9e1d36ae262a8", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0", + "phpoption/phpoption": "^1.9" + }, + "require-dev": { + "phpunit/phpunit": "^8.5.28 || ^9.5.21" + }, + "type": "library", + "autoload": { + "psr-4": { + "GrahamCampbell\\ResultType\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + } + ], + "description": "An Implementation Of The Result Type", + "keywords": [ + "Graham Campbell", + "GrahamCampbell", + "Result Type", + 
"Result-Type", + "result" + ], + "support": { + "issues": "https://github.com/GrahamCampbell/Result-Type/issues", + "source": "https://github.com/GrahamCampbell/Result-Type/tree/v1.1.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/graham-campbell/result-type", + "type": "tidelift" + } + ], + "time": "2022-07-30T15:56:11+00:00" + }, + { + "name": "guzzlehttp/guzzle", + "version": "7.5.0", + "source": { + "type": "git", + "url": "https://github.com/guzzle/guzzle.git", + "reference": "b50a2a1251152e43f6a37f0fa053e730a67d25ba" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/guzzle/zipball/b50a2a1251152e43f6a37f0fa053e730a67d25ba", + "reference": "b50a2a1251152e43f6a37f0fa053e730a67d25ba", + "shasum": "" + }, + "require": { + "ext-json": "*", + "guzzlehttp/promises": "^1.5", + "guzzlehttp/psr7": "^1.9 || ^2.4", + "php": "^7.2.5 || ^8.0", + "psr/http-client": "^1.0", + "symfony/deprecation-contracts": "^2.2 || ^3.0" + }, + "provide": { + "psr/http-client-implementation": "1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.1", + "ext-curl": "*", + "php-http/client-integration-tests": "^3.0", + "phpunit/phpunit": "^8.5.29 || ^9.5.23", + "psr/log": "^1.1 || ^2.0 || ^3.0" + }, + "suggest": { + "ext-curl": "Required for CURL handler support", + "ext-intl": "Required for Internationalized Domain Name (IDN) support", + "psr/log": "Required for using the Log middleware" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + }, + "branch-alias": { + "dev-master": "7.5-dev" + } + }, + "autoload": { + "files": [ + "src/functions_include.php" + ], + "psr-4": { + "GuzzleHttp\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": 
"https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "Jeremy Lindblom", + "email": "jeremeamia@gmail.com", + "homepage": "https://github.com/jeremeamia" + }, + { + "name": "George Mponos", + "email": "gmponos@gmail.com", + "homepage": "https://github.com/gmponos" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://github.com/sagikazarmark" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + } + ], + "description": "Guzzle is a PHP HTTP client library", + "keywords": [ + "client", + "curl", + "framework", + "http", + "http client", + "psr-18", + "psr-7", + "rest", + "web service" + ], + "support": { + "issues": "https://github.com/guzzle/guzzle/issues", + "source": "https://github.com/guzzle/guzzle/tree/7.5.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/guzzle", + "type": "tidelift" + } + ], + "time": "2022-08-28T15:39:27+00:00" + }, + { + "name": "guzzlehttp/promises", + "version": "1.5.2", + "source": { + "type": "git", + "url": "https://github.com/guzzle/promises.git", + "reference": "b94b2807d85443f9719887892882d0329d1e2598" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/promises/zipball/b94b2807d85443f9719887892882d0329d1e2598", + "reference": "b94b2807d85443f9719887892882d0329d1e2598", + "shasum": "" + }, + "require": { + "php": ">=5.5" + }, + "require-dev": { + "symfony/phpunit-bridge": "^4.4 || ^5.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.5-dev" + } + }, + 
"autoload": { + "files": [ + "src/functions_include.php" + ], + "psr-4": { + "GuzzleHttp\\Promise\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + } + ], + "description": "Guzzle promises library", + "keywords": [ + "promise" + ], + "support": { + "issues": "https://github.com/guzzle/promises/issues", + "source": "https://github.com/guzzle/promises/tree/1.5.2" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/promises", + "type": "tidelift" + } + ], + "time": "2022-08-28T14:55:35+00:00" + }, + { + "name": "guzzlehttp/psr7", + "version": "2.4.3", + "source": { + "type": "git", + "url": "https://github.com/guzzle/psr7.git", + "reference": "67c26b443f348a51926030c83481b85718457d3d" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/psr7/zipball/67c26b443f348a51926030c83481b85718457d3d", + "reference": "67c26b443f348a51926030c83481b85718457d3d", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0", + "psr/http-factory": "^1.0", + "psr/http-message": "^1.0", + "ralouphie/getallheaders": "^3.0" + }, + "provide": { + "psr/http-factory-implementation": "1.0", + "psr/http-message-implementation": "1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.1", + "http-interop/http-factory-tests": "^0.9", + 
"phpunit/phpunit": "^8.5.29 || ^9.5.23" + }, + "suggest": { + "laminas/laminas-httphandlerrunner": "Emit PSR-7 responses" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + }, + "branch-alias": { + "dev-master": "2.4-dev" + } + }, + "autoload": { + "psr-4": { + "GuzzleHttp\\Psr7\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "George Mponos", + "email": "gmponos@gmail.com", + "homepage": "https://github.com/gmponos" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://github.com/sagikazarmark" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://sagikazarmark.hu" + } + ], + "description": "PSR-7 message implementation that also provides common utility methods", + "keywords": [ + "http", + "message", + "psr-7", + "request", + "response", + "stream", + "uri", + "url" + ], + "support": { + "issues": "https://github.com/guzzle/psr7/issues", + "source": "https://github.com/guzzle/psr7/tree/2.4.3" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/psr7", + "type": "tidelift" + } + ], + "time": "2022-10-26T14:07:24+00:00" + }, + { + "name": "masterminds/html5", + "version": "2.7.6", + "source": 
{ + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "897eb517a343a2281f11bc5556d6548db7d93947" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/897eb517a343a2281f11bc5556d6548db7d93947", + "reference": "897eb517a343a2281f11bc5556d6548db7d93947", + "shasum": "" + }, + "require": { + "ext-ctype": "*", + "ext-dom": "*", + "ext-libxml": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.7.6" + }, + "time": "2022-08-18T16:18:26+00:00" + }, + { + "name": "phpoption/phpoption", + "version": "1.9.0", + "source": { + "type": "git", + "url": "https://github.com/schmittjoh/php-option.git", + "reference": "dc5ff11e274a90cc1c743f66c9ad700ce50db9ab" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/schmittjoh/php-option/zipball/dc5ff11e274a90cc1c743f66c9ad700ce50db9ab", + "reference": "dc5ff11e274a90cc1c743f66c9ad700ce50db9ab", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8", + "phpunit/phpunit": "^8.5.28 || ^9.5.21" + }, + "type": 
"library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": true + }, + "branch-alias": { + "dev-master": "1.9-dev" + } + }, + "autoload": { + "psr-4": { + "PhpOption\\": "src/PhpOption/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Apache-2.0" + ], + "authors": [ + { + "name": "Johannes M. Schmitt", + "email": "schmittjoh@gmail.com", + "homepage": "https://github.com/schmittjoh" + }, + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + } + ], + "description": "Option Type for PHP", + "keywords": [ + "language", + "option", + "php", + "type" + ], + "support": { + "issues": "https://github.com/schmittjoh/php-option/issues", + "source": "https://github.com/schmittjoh/php-option/tree/1.9.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/phpoption/phpoption", + "type": "tidelift" + } + ], + "time": "2022-07-30T15:51:26+00:00" + }, + { + "name": "psr/http-client", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-client.git", + "reference": "2dfb5f6c5eff0e91e20e913f8c5452ed95b86621" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-client/zipball/2dfb5f6c5eff0e91e20e913f8c5452ed95b86621", + "reference": "2dfb5f6c5eff0e91e20e913f8c5452ed95b86621", + "shasum": "" + }, + "require": { + "php": "^7.0 || ^8.0", + "psr/http-message": "^1.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Client\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interface for HTTP clients", + "homepage": 
"https://github.com/php-fig/http-client", + "keywords": [ + "http", + "http-client", + "psr", + "psr-18" + ], + "support": { + "source": "https://github.com/php-fig/http-client/tree/master" + }, + "time": "2020-06-29T06:28:15+00:00" + }, + { + "name": "psr/http-factory", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-factory.git", + "reference": "12ac7fcd07e5b077433f5f2bee95b3a771bf61be" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-factory/zipball/12ac7fcd07e5b077433f5f2bee95b3a771bf61be", + "reference": "12ac7fcd07e5b077433f5f2bee95b3a771bf61be", + "shasum": "" + }, + "require": { + "php": ">=7.0.0", + "psr/http-message": "^1.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interfaces for PSR-7 HTTP message factories", + "keywords": [ + "factory", + "http", + "message", + "psr", + "psr-17", + "psr-7", + "request", + "response" + ], + "support": { + "source": "https://github.com/php-fig/http-factory/tree/master" + }, + "time": "2019-04-30T12:38:16+00:00" + }, + { + "name": "psr/http-message", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-message.git", + "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-message/zipball/f6561bf28d520154e4b0ec72be95418abe6d9363", + "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": 
"src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interface for HTTP messages", + "homepage": "https://github.com/php-fig/http-message", + "keywords": [ + "http", + "http-message", + "psr", + "psr-7", + "request", + "response" + ], + "support": { + "source": "https://github.com/php-fig/http-message/tree/master" + }, + "time": "2016-08-06T14:39:51+00:00" + }, + { + "name": "ralouphie/getallheaders", + "version": "3.0.3", + "source": { + "type": "git", + "url": "https://github.com/ralouphie/getallheaders.git", + "reference": "120b605dfeb996808c31b6477290a714d356e822" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/ralouphie/getallheaders/zipball/120b605dfeb996808c31b6477290a714d356e822", + "reference": "120b605dfeb996808c31b6477290a714d356e822", + "shasum": "" + }, + "require": { + "php": ">=5.6" + }, + "require-dev": { + "php-coveralls/php-coveralls": "^2.1", + "phpunit/phpunit": "^5 || ^6.5" + }, + "type": "library", + "autoload": { + "files": [ + "src/getallheaders.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Ralph Khattar", + "email": "ralph.khattar@gmail.com" + } + ], + "description": "A polyfill for getallheaders.", + "support": { + "issues": "https://github.com/ralouphie/getallheaders/issues", + "source": "https://github.com/ralouphie/getallheaders/tree/develop" + }, + "time": "2019-03-08T08:55:37+00:00" + }, + { + "name": "symfony/css-selector", + "version": "v6.1.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/css-selector.git", + "reference": "0dd5e36b80e1de97f8f74ed7023ac2b837a36443" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/css-selector/zipball/0dd5e36b80e1de97f8f74ed7023ac2b837a36443", + "reference": 
"0dd5e36b80e1de97f8f74ed7023ac2b837a36443", + "shasum": "" + }, + "require": { + "php": ">=8.1" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\CssSelector\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Jean-François Simon", + "email": "jeanfrancois.simon@sensiolabs.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Converts CSS selectors to XPath expressions", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/css-selector/tree/v6.1.3" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-06-27T17:24:16+00:00" + }, + { + "name": "symfony/deprecation-contracts", + "version": "v3.1.1", + "source": { + "type": "git", + "url": "https://github.com/symfony/deprecation-contracts.git", + "reference": "07f1b9cc2ffee6aaafcf4b710fbc38ff736bd918" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/07f1b9cc2ffee6aaafcf4b710fbc38ff736bd918", + "reference": "07f1b9cc2ffee6aaafcf4b710fbc38ff736bd918", + "shasum": "" + }, + "require": { + "php": ">=8.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "3.1-dev" + }, + "thanks": { + "name": "symfony/contracts", + "url": "https://github.com/symfony/contracts" + } + }, + "autoload": { + "files": [ + "function.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + 
"name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "A generic function and convention to trigger deprecation notices", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/deprecation-contracts/tree/v3.1.1" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-02-25T11:15:52+00:00" + }, + { + "name": "symfony/dom-crawler", + "version": "v6.1.4", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "8cb4c6e6c8d30c26f70529ed5e50d79a09576c0c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/8cb4c6e6c8d30c26f70529ed5e50d79a09576c0c", + "reference": "8cb4c6e6c8d30c26f70529ed5e50d79a09576c0c", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.1", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^5.4|^6.0" + }, + "suggest": { + "symfony/css-selector": "" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v6.1.4" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": 
"https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-08-04T19:19:00+00:00" + }, + { + "name": "symfony/polyfill-ctype", + "version": "v1.26.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-ctype.git", + "reference": "6fd1b9a79f6e3cf65f9e679b23af304cd9e010d4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-ctype/zipball/6fd1b9a79f6e3cf65f9e679b23af304cd9e010d4", + "reference": "6fd1b9a79f6e3cf65f9e679b23af304cd9e010d4", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "provide": { + "ext-ctype": "*" + }, + "suggest": { + "ext-ctype": "For best performance" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "1.26-dev" + }, + "thanks": { + "name": "symfony/polyfill", + "url": "https://github.com/symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Ctype\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Gert de Pagter", + "email": "BackEndTea@gmail.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for ctype functions", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "ctype", + "polyfill", + "portable" + ], + "support": { + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.26.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-05-24T11:49:31+00:00" + }, + { + "name": "symfony/polyfill-mbstring", + "version": "v1.26.0", + "source": { + "type": "git", + 
"url": "https://github.com/symfony/polyfill-mbstring.git", + "reference": "9344f9cb97f3b19424af1a21a3b0e75b0a7d8d7e" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/9344f9cb97f3b19424af1a21a3b0e75b0a7d8d7e", + "reference": "9344f9cb97f3b19424af1a21a3b0e75b0a7d8d7e", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "provide": { + "ext-mbstring": "*" + }, + "suggest": { + "ext-mbstring": "For best performance" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "1.26-dev" + }, + "thanks": { + "name": "symfony/polyfill", + "url": "https://github.com/symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Mbstring\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for the Mbstring extension", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "mbstring", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.26.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-05-24T11:49:31+00:00" + }, + { + "name": "symfony/polyfill-php80", + "version": "v1.26.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-php80.git", + "reference": "cfa0ae98841b9e461207c13ab093d76b0fa7bace" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-php80/zipball/cfa0ae98841b9e461207c13ab093d76b0fa7bace", + "reference": 
"cfa0ae98841b9e461207c13ab093d76b0fa7bace", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "1.26-dev" + }, + "thanks": { + "name": "symfony/polyfill", + "url": "https://github.com/symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Php80\\": "" + }, + "classmap": [ + "Resources/stubs" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Ion Bazan", + "email": "ion.bazan@gmail.com" + }, + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill backporting some PHP 8.0+ features to lower PHP versions", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-php80/tree/v1.26.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-05-10T07:21:04+00:00" + }, + { + "name": "vlucas/phpdotenv", + "version": "v5.5.0", + "source": { + "type": "git", + "url": "https://github.com/vlucas/phpdotenv.git", + "reference": "1a7ea2afc49c3ee6d87061f5a233e3a035d0eae7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/vlucas/phpdotenv/zipball/1a7ea2afc49c3ee6d87061f5a233e3a035d0eae7", + "reference": "1a7ea2afc49c3ee6d87061f5a233e3a035d0eae7", + "shasum": "" + }, + "require": { + "ext-pcre": "*", + "graham-campbell/result-type": "^1.0.2", + "php": "^7.1.3 || ^8.0", + "phpoption/phpoption": "^1.8", + "symfony/polyfill-ctype": "^1.23", + "symfony/polyfill-mbstring": "^1.23.1", + 
"symfony/polyfill-php80": "^1.23.1" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.4.1", + "ext-filter": "*", + "phpunit/phpunit": "^7.5.20 || ^8.5.30 || ^9.5.25" + }, + "suggest": { + "ext-filter": "Required to use the boolean validator." + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": true + }, + "branch-alias": { + "dev-master": "5.5-dev" + } + }, + "autoload": { + "psr-4": { + "Dotenv\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Vance Lucas", + "email": "vance@vancelucas.com", + "homepage": "https://github.com/vlucas" + } + ], + "description": "Loads environment variables from `.env` to `getenv()`, `$_ENV` and `$_SERVER` automagically.", + "keywords": [ + "dotenv", + "env", + "environment" + ], + "support": { + "issues": "https://github.com/vlucas/phpdotenv/issues", + "source": "https://github.com/vlucas/phpdotenv/tree/v5.5.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/vlucas/phpdotenv", + "type": "tidelift" + } + ], + "time": "2022-10-16T01:01:54+00:00" + } + ], + "packages-dev": [], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": [], + "platform-dev": [], + "plugin-api-version": "2.3.0" +} diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..ea1e8a3 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!*.sample.* \ No newline at end of file diff --git a/data/products.sample.csv b/data/products.sample.csv new file mode 100644 index 0000000..c112dbf --- /dev/null +++ b/data/products.sample.csv @@ -0,0 +1,5 @@ +name,price +"Ternet 
Bacon 2x125g, 250 g",45.7 +"Hamar Julebrus 6 x 0,33l, 1,98 l",128.4 +"Frokostegg Fra Frittgående Høner Str L, 12 stk",43 +"Monster Ultra Watermelon 0,5 l",23 diff --git a/data/products.sample.json b/data/products.sample.json new file mode 100644 index 0000000..5543cbf --- /dev/null +++ b/data/products.sample.json @@ -0,0 +1 @@ +[{"name":"Ternet Bacon 2x125g, 250 g","price":45.7},{"name":"Hamar Julebrus 6 x 0,33l, 1,98 l","price":128.4},{"name":"Frokostegg Fra Frittg\u00e5ende H\u00f8ner Str L, 12 stk","price":43},{"name":"Monster Ultra Watermelon 0,5 l","price":23}] \ No newline at end of file diff --git a/data/products.sample.txt b/data/products.sample.txt new file mode 100644 index 0000000..9a23f24 --- /dev/null +++ b/data/products.sample.txt @@ -0,0 +1,4 @@ +https://oda.com/no/products/27923-tulip-ternet-bacon-2x125g/ +https://oda.com/no/products/23492-ringnes-hamar-julebrus-6-x-033l/ +https://oda.com/no/products/28870-prior-frokostegg-fra-frittgaende-honer-str-l/ +https://oda.com/no/products/40760-monster-monster-ultra-watermelon/ \ No newline at end of file