From 14b5cc238350af631f5302f865ec4b6ec12bf385 Mon Sep 17 00:00:00 2001 From: Alex Thomassen Date: Sat, 29 Oct 2022 21:47:05 +0000 Subject: [PATCH] Initial commit --- .env.example | 14 + .gitignore | 2 + LICENSE | 21 + OdaScraper.php | 296 +++++++++ README.md | 16 + composer.json | 8 + composer.lock | 1294 +++++++++++++++++++++++++++++++++++++ data/.gitignore | 3 + data/products.sample.csv | 5 + data/products.sample.json | 1 + data/products.sample.txt | 4 + 11 files changed, 1664 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 OdaScraper.php create mode 100644 README.md create mode 100644 composer.json create mode 100644 composer.lock create mode 100644 data/.gitignore create mode 100644 data/products.sample.csv create mode 100644 data/products.sample.json create mode 100644 data/products.sample.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..f2d31a7 --- /dev/null +++ b/.env.example @@ -0,0 +1,14 @@ +# Relative paths are relative to the current working directory (`pwd`), so explicit full paths are recommended. +# See *.sample.* files for test runs and examples. + +# Simply a list of Oda.com product URLs to scrape. One URL per line. +PRODUCT_URL_LIST=./data/products.txt + +# The output file for the scraped data. This file will be overwritten if it already exists. +PRODUCT_OUTPUT_FILE=./data/products.csv + +# Can be either 'json' or 'csv' +PRODUCT_OUTPUT_FORMAT=json + +# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt. +USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36" \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f9eee55 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +vendor +.env \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4d87c6a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2022 Alex Thomassen + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/OdaScraper.php b/OdaScraper.php new file mode 100644 index 0000000..a932496 --- /dev/null +++ b/OdaScraper.php @@ -0,0 +1,296 @@ +load(); +} + +class Oda +{ + /** + * CookieJar for Guzzle + * + * @var GuzzleHttp\Cookie\CookieJar + */ + private CookieJar $cookies; + + /** + * URL to send the initial request (for cookies). + * + * @var string + */ + private string $initUrl = 'https://oda.com/no/products/popular/'; + + /** + * URL for querying the API. + * + * @var string + */ + private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/'; + + /** + * Base HTTP headers + * + * API requires additional headers: + * - Accept: application/json (probably) + * - Cookie + * - X-CSRFToken + * + * @var array + */ + private array $headers = [ + 'Origin' => 'https://oda.com', + 'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36', + ]; + + /** + * CSRF token used for API requests + * + * @var string + */ + private ?string $csrfToken = null; + + /** + * HTTP Client + * + * @var \GuzzleHttp\Client + */ + private ?HttpClient $client; + + public function __construct() + { + $this->cookies = new CookieJar(); + $this->client = new HttpClient(['cookies' => true]); + + if (!empty($_ENV['USER_AGENT'])) { + $this->headers['User-Agent'] = $_ENV['USER_AGENT']; + } + } + + /** + * Does the initial request to get the CSRF token and cookies. + */ + public function initialize() : void + { + $options = [ + RequestOptions::HEADERS => $this->headers, + RequestOptions::COOKIES => $this->cookies, + ]; + + $this->client->request('GET', $this->initUrl, $options); + $csrfToken = $this->cookies->getCookieByName('csrftoken')->getValue() ?? null; + + $this->csrfToken = $csrfToken; + } + + /** + * Helper function for sending requests via the HTTP client. + * + * @param string $url + * @param string $method HTTP method (GET, POST, PUT etc.) + * @param array $options Guzzle options + * + * @return string + */ + private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string + { + if (empty($options)) { + $options = [ + RequestOptions::HEADERS => $this->headers, + RequestOptions::COOKIES => $this->cookies, + ]; + } + + $response = $this->client->request($method, $url, $options); + $body = $response->getBody()->getContents(); + return $body; + } + + /** + * Helper function for making products API requests. + * + * @param array $excludeIds + * + * @return array + */ + public function getPopularProducts($excludeIds = []) : array + { + $headers = $this->headers; + $headers['Accept'] = 'application/json'; + $headers['X-CSRFToken'] = $this->csrfToken; + + $options = [ + RequestOptions::HEADERS => $headers, + RequestOptions::COOKIES => $this->cookies, + RequestOptions::JSON => [ + 'exclude_ids' => $excludeIds, + ], + ]; + + $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options); + + return json_decode($body, true); + } + + /** + * Runs the scraper to scrape popular products + * + * This relies on the Oda API, unlike the scraping. + * + * @return void + */ + public function fetchPopular() : void + { + $this->initialize(); + + $excludeIds = []; + $count = 0; + $products = []; + + $initialRequest = $this->getPopularProducts(); + + $results = $initialRequest['results']; + + $products = array_merge($products, $results); + $productIds = array_column($results, 'id'); + $excludeIds = array_merge($excludeIds, $productIds); + + $hasMore = $initialRequest['has_more']; + + echo sprintf('Scraped %d products', count($products)) . PHP_EOL; + + while ($hasMore) { + $count++; + $request = $this->getPopularProducts($excludeIds); + + $results = $request['results']; + $products = array_merge($products, $results); + + echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL; + + $productIds = array_column($results, 'id'); + $excludeIds = array_merge($excludeIds, $productIds); + + $hasMore = $request['has_more']; + + if (!$hasMore) { + break; + } + } + + $filename = sprintf('%s/data/%s.json', __DIR__, time()); + file_put_contents($filename, json_encode($products)); + } + + /** + * Takes a Oda product page URL and scrapes the product data. + * + * @param string $url + * + * @return array + */ + public function scrapeProductPage(string $url) : array + { + $body = $this->getRequestBody($url); + $crawler = new Crawler($body); + + $name = trim((string) $crawler->filter('h1')->text()); + $price = (float) $crawler->filter('.price')->attr('content'); + + return [ + 'name' => $name, + 'price' => $price, + ]; + } + + /** + * Writes the scraped data to a CSV file. + * + * @param array $products + * + * @return void + */ + private function writeProductsToFile(array $products) : void + { + if (empty($products)) { + return; + } + + $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT']; + $outputFile = $_ENV['PRODUCT_OUTPUT_FILE']; + + if (empty($outputFormat) || empty($outputFile)) { + echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL; + return; + } + + if ($outputFormat === 'json') { + file_put_contents($outputFile, json_encode($products)); + return; + } + + if ($outputFormat === 'csv') { + $handle = fopen($outputFile, 'w'); + + // Add the first row as headers. Assuming all products are formatted the same, of course. + $columns = array_keys($products[0]); + fputcsv($handle, $columns); + + foreach ($products as $product) + { + fputcsv($handle, $product); + } + + fclose($handle); + return; + } + + echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL); + } + + /** + * Scrapes products from the specified `PRODUCT_URL_LIST` file. + * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future. + * + * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format. + * + * @return array + */ + public function scrapeFromFile() : array + { + $filename = $_ENV['PRODUCT_URL_LIST']; + + if (empty($filename)) { + echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL; + return []; + } + + $inputHandle = fopen($filename, 'r'); + $products = []; + + while (($line = fgets($inputHandle)) !== false) { + $url = trim($line); + $product = $this->scrapeProductPage($url); + $products[] = $product; + } + + fclose($inputHandle); + + $this->writeProductsToFile($products); + return $products; + } +} + +$oda = new Oda(); +$oda->scrapeFromFile(); \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..99364f9 --- /dev/null +++ b/README.md @@ -0,0 +1,16 @@ +# Scraper for Oda.com + +## Requirements + +- PHP 8.1 (cli) +- PHP extension: `php-xml` +- PHP extension: `php-curl` +- [`Composer`](https://getcomposer.org/) + - Other PHP extensions may also be necessary to install. Composer will likely complain when you run `composer install`. + +## Setup + +- Copy `.env.example` to `.env` and fill in the values as you see fit. +- Run `composer install` to install dependencies. +- Run `php OdaScraper.php` to scrape the products. +- Script something to send the output files to wherever you want? \ No newline at end of file diff --git a/composer.json b/composer.json new file mode 100644 index 0000000..9fb76b2 --- /dev/null +++ b/composer.json @@ -0,0 +1,8 @@ +{ + "require": { + "guzzlehttp/guzzle": "^7.5", + "symfony/dom-crawler": "^6.1", + "symfony/css-selector": "^6.1", + "vlucas/phpdotenv": "^5.5" + } +} diff --git a/composer.lock b/composer.lock new file mode 100644 index 0000000..967e551 --- /dev/null +++ b/composer.lock @@ -0,0 +1,1294 @@ +{ + "_readme": [ + "This file locks the dependencies of your project to a known state", + "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", + "This file is @generated automatically" + ], + "content-hash": "e596a64ebcd46e365fa53285412a5a5f", + "packages": [ + { + "name": "graham-campbell/result-type", + "version": "v1.1.0", + "source": { + "type": "git", + "url": "https://github.com/GrahamCampbell/Result-Type.git", + "reference": "a878d45c1914464426dc94da61c9e1d36ae262a8" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/GrahamCampbell/Result-Type/zipball/a878d45c1914464426dc94da61c9e1d36ae262a8", + "reference": "a878d45c1914464426dc94da61c9e1d36ae262a8", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0", + "phpoption/phpoption": "^1.9" + }, + "require-dev": { + "phpunit/phpunit": "^8.5.28 || ^9.5.21" + }, + "type": "library", + "autoload": { + "psr-4": { + "GrahamCampbell\\ResultType\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + } + ], + "description": "An Implementation Of The Result Type", + "keywords": [ + "Graham Campbell", + "GrahamCampbell", + "Result Type", + "Result-Type", + "result" + ], + "support": { + "issues": "https://github.com/GrahamCampbell/Result-Type/issues", + "source": "https://github.com/GrahamCampbell/Result-Type/tree/v1.1.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/graham-campbell/result-type", + "type": "tidelift" + } + ], + "time": "2022-07-30T15:56:11+00:00" + }, + { + "name": "guzzlehttp/guzzle", + "version": "7.5.0", + "source": { + "type": "git", + "url": "https://github.com/guzzle/guzzle.git", + "reference": "b50a2a1251152e43f6a37f0fa053e730a67d25ba" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/guzzle/zipball/b50a2a1251152e43f6a37f0fa053e730a67d25ba", + "reference": "b50a2a1251152e43f6a37f0fa053e730a67d25ba", + "shasum": "" + }, + "require": { + "ext-json": "*", + "guzzlehttp/promises": "^1.5", + "guzzlehttp/psr7": "^1.9 || ^2.4", + "php": "^7.2.5 || ^8.0", + "psr/http-client": "^1.0", + "symfony/deprecation-contracts": "^2.2 || ^3.0" + }, + "provide": { + "psr/http-client-implementation": "1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.1", + "ext-curl": "*", + "php-http/client-integration-tests": "^3.0", + "phpunit/phpunit": "^8.5.29 || ^9.5.23", + "psr/log": "^1.1 || ^2.0 || ^3.0" + }, + "suggest": { + "ext-curl": "Required for CURL handler support", + "ext-intl": "Required for Internationalized Domain Name (IDN) support", + "psr/log": "Required for using the Log middleware" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + }, + "branch-alias": { + "dev-master": "7.5-dev" + } + }, + "autoload": { + "files": [ + "src/functions_include.php" + ], + "psr-4": { + "GuzzleHttp\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "Jeremy Lindblom", + "email": "jeremeamia@gmail.com", + "homepage": "https://github.com/jeremeamia" + }, + { + "name": "George Mponos", + "email": "gmponos@gmail.com", + "homepage": "https://github.com/gmponos" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://github.com/sagikazarmark" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + } + ], + "description": "Guzzle is a PHP HTTP client library", + "keywords": [ + "client", + "curl", + "framework", + "http", + "http client", + "psr-18", + "psr-7", + "rest", + "web service" + ], + "support": { + "issues": "https://github.com/guzzle/guzzle/issues", + "source": "https://github.com/guzzle/guzzle/tree/7.5.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/guzzle", + "type": "tidelift" + } + ], + "time": "2022-08-28T15:39:27+00:00" + }, + { + "name": "guzzlehttp/promises", + "version": "1.5.2", + "source": { + "type": "git", + "url": "https://github.com/guzzle/promises.git", + "reference": "b94b2807d85443f9719887892882d0329d1e2598" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/promises/zipball/b94b2807d85443f9719887892882d0329d1e2598", + "reference": "b94b2807d85443f9719887892882d0329d1e2598", + "shasum": "" + }, + "require": { + "php": ">=5.5" + }, + "require-dev": { + "symfony/phpunit-bridge": "^4.4 || ^5.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.5-dev" + } + }, + "autoload": { + "files": [ + "src/functions_include.php" + ], + "psr-4": { + "GuzzleHttp\\Promise\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + } + ], + "description": "Guzzle promises library", + "keywords": [ + "promise" + ], + "support": { + "issues": "https://github.com/guzzle/promises/issues", + "source": "https://github.com/guzzle/promises/tree/1.5.2" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/promises", + "type": "tidelift" + } + ], + "time": "2022-08-28T14:55:35+00:00" + }, + { + "name": "guzzlehttp/psr7", + "version": "2.4.3", + "source": { + "type": "git", + "url": "https://github.com/guzzle/psr7.git", + "reference": "67c26b443f348a51926030c83481b85718457d3d" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/guzzle/psr7/zipball/67c26b443f348a51926030c83481b85718457d3d", + "reference": "67c26b443f348a51926030c83481b85718457d3d", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0", + "psr/http-factory": "^1.0", + "psr/http-message": "^1.0", + "ralouphie/getallheaders": "^3.0" + }, + "provide": { + "psr/http-factory-implementation": "1.0", + "psr/http-message-implementation": "1.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8.1", + "http-interop/http-factory-tests": "^0.9", + "phpunit/phpunit": "^8.5.29 || ^9.5.23" + }, + "suggest": { + "laminas/laminas-httphandlerrunner": "Emit PSR-7 responses" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": false + }, + "branch-alias": { + "dev-master": "2.4-dev" + } + }, + "autoload": { + "psr-4": { + "GuzzleHttp\\Psr7\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Michael Dowling", + "email": "mtdowling@gmail.com", + "homepage": "https://github.com/mtdowling" + }, + { + "name": "George Mponos", + "email": "gmponos@gmail.com", + "homepage": "https://github.com/gmponos" + }, + { + "name": "Tobias Nyholm", + "email": "tobias.nyholm@gmail.com", + "homepage": "https://github.com/Nyholm" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://github.com/sagikazarmark" + }, + { + "name": "Tobias Schultze", + "email": "webmaster@tubo-world.de", + "homepage": "https://github.com/Tobion" + }, + { + "name": "Márk Sági-Kazár", + "email": "mark.sagikazar@gmail.com", + "homepage": "https://sagikazarmark.hu" + } + ], + "description": "PSR-7 message implementation that also provides common utility methods", + "keywords": [ + "http", + "message", + "psr-7", + "request", + "response", + "stream", + "uri", + "url" + ], + "support": { + "issues": "https://github.com/guzzle/psr7/issues", + "source": "https://github.com/guzzle/psr7/tree/2.4.3" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://github.com/Nyholm", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/guzzlehttp/psr7", + "type": "tidelift" + } + ], + "time": "2022-10-26T14:07:24+00:00" + }, + { + "name": "masterminds/html5", + "version": "2.7.6", + "source": { + "type": "git", + "url": "https://github.com/Masterminds/html5-php.git", + "reference": "897eb517a343a2281f11bc5556d6548db7d93947" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/Masterminds/html5-php/zipball/897eb517a343a2281f11bc5556d6548db7d93947", + "reference": "897eb517a343a2281f11bc5556d6548db7d93947", + "shasum": "" + }, + "require": { + "ext-ctype": "*", + "ext-dom": "*", + "ext-libxml": "*", + "php": ">=5.3.0" + }, + "require-dev": { + "phpunit/phpunit": "^4.8.35 || ^5.7.21 || ^6 || ^7" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "2.7-dev" + } + }, + "autoload": { + "psr-4": { + "Masterminds\\": "src" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Matt Butcher", + "email": "technosophos@gmail.com" + }, + { + "name": "Matt Farina", + "email": "matt@mattfarina.com" + }, + { + "name": "Asmir Mustafic", + "email": "goetas@gmail.com" + } + ], + "description": "An HTML5 parser and serializer.", + "homepage": "http://masterminds.github.io/html5-php", + "keywords": [ + "HTML5", + "dom", + "html", + "parser", + "querypath", + "serializer", + "xml" + ], + "support": { + "issues": "https://github.com/Masterminds/html5-php/issues", + "source": "https://github.com/Masterminds/html5-php/tree/2.7.6" + }, + "time": "2022-08-18T16:18:26+00:00" + }, + { + "name": "phpoption/phpoption", + "version": "1.9.0", + "source": { + "type": "git", + "url": "https://github.com/schmittjoh/php-option.git", + "reference": "dc5ff11e274a90cc1c743f66c9ad700ce50db9ab" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/schmittjoh/php-option/zipball/dc5ff11e274a90cc1c743f66c9ad700ce50db9ab", + "reference": "dc5ff11e274a90cc1c743f66c9ad700ce50db9ab", + "shasum": "" + }, + "require": { + "php": "^7.2.5 || ^8.0" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.8", + "phpunit/phpunit": "^8.5.28 || ^9.5.21" + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": true + }, + "branch-alias": { + "dev-master": "1.9-dev" + } + }, + "autoload": { + "psr-4": { + "PhpOption\\": "src/PhpOption/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "Apache-2.0" + ], + "authors": [ + { + "name": "Johannes M. Schmitt", + "email": "schmittjoh@gmail.com", + "homepage": "https://github.com/schmittjoh" + }, + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + } + ], + "description": "Option Type for PHP", + "keywords": [ + "language", + "option", + "php", + "type" + ], + "support": { + "issues": "https://github.com/schmittjoh/php-option/issues", + "source": "https://github.com/schmittjoh/php-option/tree/1.9.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/phpoption/phpoption", + "type": "tidelift" + } + ], + "time": "2022-07-30T15:51:26+00:00" + }, + { + "name": "psr/http-client", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-client.git", + "reference": "2dfb5f6c5eff0e91e20e913f8c5452ed95b86621" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-client/zipball/2dfb5f6c5eff0e91e20e913f8c5452ed95b86621", + "reference": "2dfb5f6c5eff0e91e20e913f8c5452ed95b86621", + "shasum": "" + }, + "require": { + "php": "^7.0 || ^8.0", + "psr/http-message": "^1.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Client\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interface for HTTP clients", + "homepage": "https://github.com/php-fig/http-client", + "keywords": [ + "http", + "http-client", + "psr", + "psr-18" + ], + "support": { + "source": "https://github.com/php-fig/http-client/tree/master" + }, + "time": "2020-06-29T06:28:15+00:00" + }, + { + "name": "psr/http-factory", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-factory.git", + "reference": "12ac7fcd07e5b077433f5f2bee95b3a771bf61be" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-factory/zipball/12ac7fcd07e5b077433f5f2bee95b3a771bf61be", + "reference": "12ac7fcd07e5b077433f5f2bee95b3a771bf61be", + "shasum": "" + }, + "require": { + "php": ">=7.0.0", + "psr/http-message": "^1.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interfaces for PSR-7 HTTP message factories", + "keywords": [ + "factory", + "http", + "message", + "psr", + "psr-17", + "psr-7", + "request", + "response" + ], + "support": { + "source": "https://github.com/php-fig/http-factory/tree/master" + }, + "time": "2019-04-30T12:38:16+00:00" + }, + { + "name": "psr/http-message", + "version": "1.0.1", + "source": { + "type": "git", + "url": "https://github.com/php-fig/http-message.git", + "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/php-fig/http-message/zipball/f6561bf28d520154e4b0ec72be95418abe6d9363", + "reference": "f6561bf28d520154e4b0ec72be95418abe6d9363", + "shasum": "" + }, + "require": { + "php": ">=5.3.0" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-master": "1.0.x-dev" + } + }, + "autoload": { + "psr-4": { + "Psr\\Http\\Message\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "PHP-FIG", + "homepage": "http://www.php-fig.org/" + } + ], + "description": "Common interface for HTTP messages", + "homepage": "https://github.com/php-fig/http-message", + "keywords": [ + "http", + "http-message", + "psr", + "psr-7", + "request", + "response" + ], + "support": { + "source": "https://github.com/php-fig/http-message/tree/master" + }, + "time": "2016-08-06T14:39:51+00:00" + }, + { + "name": "ralouphie/getallheaders", + "version": "3.0.3", + "source": { + "type": "git", + "url": "https://github.com/ralouphie/getallheaders.git", + "reference": "120b605dfeb996808c31b6477290a714d356e822" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/ralouphie/getallheaders/zipball/120b605dfeb996808c31b6477290a714d356e822", + "reference": "120b605dfeb996808c31b6477290a714d356e822", + "shasum": "" + }, + "require": { + "php": ">=5.6" + }, + "require-dev": { + "php-coveralls/php-coveralls": "^2.1", + "phpunit/phpunit": "^5 || ^6.5" + }, + "type": "library", + "autoload": { + "files": [ + "src/getallheaders.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Ralph Khattar", + "email": "ralph.khattar@gmail.com" + } + ], + "description": "A polyfill for getallheaders.", + "support": { + "issues": "https://github.com/ralouphie/getallheaders/issues", + "source": "https://github.com/ralouphie/getallheaders/tree/develop" + }, + "time": "2019-03-08T08:55:37+00:00" + }, + { + "name": "symfony/css-selector", + "version": "v6.1.3", + "source": { + "type": "git", + "url": "https://github.com/symfony/css-selector.git", + "reference": "0dd5e36b80e1de97f8f74ed7023ac2b837a36443" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/css-selector/zipball/0dd5e36b80e1de97f8f74ed7023ac2b837a36443", + "reference": "0dd5e36b80e1de97f8f74ed7023ac2b837a36443", + "shasum": "" + }, + "require": { + "php": ">=8.1" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\CssSelector\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Jean-François Simon", + "email": "jeanfrancois.simon@sensiolabs.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Converts CSS selectors to XPath expressions", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/css-selector/tree/v6.1.3" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-06-27T17:24:16+00:00" + }, + { + "name": "symfony/deprecation-contracts", + "version": "v3.1.1", + "source": { + "type": "git", + "url": "https://github.com/symfony/deprecation-contracts.git", + "reference": "07f1b9cc2ffee6aaafcf4b710fbc38ff736bd918" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/deprecation-contracts/zipball/07f1b9cc2ffee6aaafcf4b710fbc38ff736bd918", + "reference": "07f1b9cc2ffee6aaafcf4b710fbc38ff736bd918", + "shasum": "" + }, + "require": { + "php": ">=8.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "3.1-dev" + }, + "thanks": { + "name": "symfony/contracts", + "url": "https://github.com/symfony/contracts" + } + }, + "autoload": { + "files": [ + "function.php" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "A generic function and convention to trigger deprecation notices", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/deprecation-contracts/tree/v3.1.1" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-02-25T11:15:52+00:00" + }, + { + "name": "symfony/dom-crawler", + "version": "v6.1.4", + "source": { + "type": "git", + "url": "https://github.com/symfony/dom-crawler.git", + "reference": "8cb4c6e6c8d30c26f70529ed5e50d79a09576c0c" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/dom-crawler/zipball/8cb4c6e6c8d30c26f70529ed5e50d79a09576c0c", + "reference": "8cb4c6e6c8d30c26f70529ed5e50d79a09576c0c", + "shasum": "" + }, + "require": { + "masterminds/html5": "^2.6", + "php": ">=8.1", + "symfony/polyfill-ctype": "~1.8", + "symfony/polyfill-mbstring": "~1.0" + }, + "require-dev": { + "symfony/css-selector": "^5.4|^6.0" + }, + "suggest": { + "symfony/css-selector": "" + }, + "type": "library", + "autoload": { + "psr-4": { + "Symfony\\Component\\DomCrawler\\": "" + }, + "exclude-from-classmap": [ + "/Tests/" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Fabien Potencier", + "email": "fabien@symfony.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Eases DOM navigation for HTML and XML documents", + "homepage": "https://symfony.com", + "support": { + "source": "https://github.com/symfony/dom-crawler/tree/v6.1.4" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-08-04T19:19:00+00:00" + }, + { + "name": "symfony/polyfill-ctype", + "version": "v1.26.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-ctype.git", + "reference": "6fd1b9a79f6e3cf65f9e679b23af304cd9e010d4" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-ctype/zipball/6fd1b9a79f6e3cf65f9e679b23af304cd9e010d4", + "reference": "6fd1b9a79f6e3cf65f9e679b23af304cd9e010d4", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "provide": { + "ext-ctype": "*" + }, + "suggest": { + "ext-ctype": "For best performance" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "1.26-dev" + }, + "thanks": { + "name": "symfony/polyfill", + "url": "https://github.com/symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Ctype\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Gert de Pagter", + "email": "BackEndTea@gmail.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for ctype functions", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "ctype", + "polyfill", + "portable" + ], + "support": { + "source": "https://github.com/symfony/polyfill-ctype/tree/v1.26.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-05-24T11:49:31+00:00" + }, + { + "name": "symfony/polyfill-mbstring", + "version": "v1.26.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-mbstring.git", + "reference": "9344f9cb97f3b19424af1a21a3b0e75b0a7d8d7e" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-mbstring/zipball/9344f9cb97f3b19424af1a21a3b0e75b0a7d8d7e", + "reference": "9344f9cb97f3b19424af1a21a3b0e75b0a7d8d7e", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "provide": { + "ext-mbstring": "*" + }, + "suggest": { + "ext-mbstring": "For best performance" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "1.26-dev" + }, + "thanks": { + "name": "symfony/polyfill", + "url": "https://github.com/symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Mbstring\\": "" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill for the Mbstring extension", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "mbstring", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-mbstring/tree/v1.26.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-05-24T11:49:31+00:00" + }, + { + "name": "symfony/polyfill-php80", + "version": "v1.26.0", + "source": { + "type": "git", + "url": "https://github.com/symfony/polyfill-php80.git", + "reference": "cfa0ae98841b9e461207c13ab093d76b0fa7bace" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/symfony/polyfill-php80/zipball/cfa0ae98841b9e461207c13ab093d76b0fa7bace", + "reference": "cfa0ae98841b9e461207c13ab093d76b0fa7bace", + "shasum": "" + }, + "require": { + "php": ">=7.1" + }, + "type": "library", + "extra": { + "branch-alias": { + "dev-main": "1.26-dev" + }, + "thanks": { + "name": "symfony/polyfill", + "url": "https://github.com/symfony/polyfill" + } + }, + "autoload": { + "files": [ + "bootstrap.php" + ], + "psr-4": { + "Symfony\\Polyfill\\Php80\\": "" + }, + "classmap": [ + "Resources/stubs" + ] + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "MIT" + ], + "authors": [ + { + "name": "Ion Bazan", + "email": "ion.bazan@gmail.com" + }, + { + "name": "Nicolas Grekas", + "email": "p@tchwork.com" + }, + { + "name": "Symfony Community", + "homepage": "https://symfony.com/contributors" + } + ], + "description": "Symfony polyfill backporting some PHP 8.0+ features to lower PHP versions", + "homepage": "https://symfony.com", + "keywords": [ + "compatibility", + "polyfill", + "portable", + "shim" + ], + "support": { + "source": "https://github.com/symfony/polyfill-php80/tree/v1.26.0" + }, + "funding": [ + { + "url": "https://symfony.com/sponsor", + "type": "custom" + }, + { + "url": "https://github.com/fabpot", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/symfony/symfony", + "type": "tidelift" + } + ], + "time": "2022-05-10T07:21:04+00:00" + }, + { + "name": "vlucas/phpdotenv", + "version": "v5.5.0", + "source": { + "type": "git", + "url": "https://github.com/vlucas/phpdotenv.git", + "reference": "1a7ea2afc49c3ee6d87061f5a233e3a035d0eae7" + }, + "dist": { + "type": "zip", + "url": "https://api.github.com/repos/vlucas/phpdotenv/zipball/1a7ea2afc49c3ee6d87061f5a233e3a035d0eae7", + "reference": "1a7ea2afc49c3ee6d87061f5a233e3a035d0eae7", + "shasum": "" + }, + "require": { + "ext-pcre": "*", + "graham-campbell/result-type": "^1.0.2", + "php": "^7.1.3 || ^8.0", + "phpoption/phpoption": "^1.8", + "symfony/polyfill-ctype": "^1.23", + "symfony/polyfill-mbstring": "^1.23.1", + "symfony/polyfill-php80": "^1.23.1" + }, + "require-dev": { + "bamarni/composer-bin-plugin": "^1.4.1", + "ext-filter": "*", + "phpunit/phpunit": "^7.5.20 || ^8.5.30 || ^9.5.25" + }, + "suggest": { + "ext-filter": "Required to use the boolean validator." + }, + "type": "library", + "extra": { + "bamarni-bin": { + "bin-links": true, + "forward-command": true + }, + "branch-alias": { + "dev-master": "5.5-dev" + } + }, + "autoload": { + "psr-4": { + "Dotenv\\": "src/" + } + }, + "notification-url": "https://packagist.org/downloads/", + "license": [ + "BSD-3-Clause" + ], + "authors": [ + { + "name": "Graham Campbell", + "email": "hello@gjcampbell.co.uk", + "homepage": "https://github.com/GrahamCampbell" + }, + { + "name": "Vance Lucas", + "email": "vance@vancelucas.com", + "homepage": "https://github.com/vlucas" + } + ], + "description": "Loads environment variables from `.env` to `getenv()`, `$_ENV` and `$_SERVER` automagically.", + "keywords": [ + "dotenv", + "env", + "environment" + ], + "support": { + "issues": "https://github.com/vlucas/phpdotenv/issues", + "source": "https://github.com/vlucas/phpdotenv/tree/v5.5.0" + }, + "funding": [ + { + "url": "https://github.com/GrahamCampbell", + "type": "github" + }, + { + "url": "https://tidelift.com/funding/github/packagist/vlucas/phpdotenv", + "type": "tidelift" + } + ], + "time": "2022-10-16T01:01:54+00:00" + } + ], + "packages-dev": [], + "aliases": [], + "minimum-stability": "stable", + "stability-flags": [], + "prefer-stable": false, + "prefer-lowest": false, + "platform": [], + "platform-dev": [], + "plugin-api-version": "2.3.0" +} diff --git a/data/.gitignore b/data/.gitignore new file mode 100644 index 0000000..ea1e8a3 --- /dev/null +++ b/data/.gitignore @@ -0,0 +1,3 @@ +* +!.gitignore +!*.sample.* \ No newline at end of file diff --git a/data/products.sample.csv b/data/products.sample.csv new file mode 100644 index 0000000..c112dbf --- /dev/null +++ b/data/products.sample.csv @@ -0,0 +1,5 @@ +name,price +"Ternet Bacon 2x125g, 250 g",45.7 +"Hamar Julebrus 6 x 0,33l, 1,98 l",128.4 +"Frokostegg Fra Frittgående Høner Str L, 12 stk",43 +"Monster Ultra Watermelon 0,5 l",23 diff --git a/data/products.sample.json b/data/products.sample.json new file mode 100644 index 0000000..5543cbf --- /dev/null +++ b/data/products.sample.json @@ -0,0 +1 @@ +[{"name":"Ternet Bacon 2x125g, 250 g","price":45.7},{"name":"Hamar Julebrus 6 x 0,33l, 1,98 l","price":128.4},{"name":"Frokostegg Fra Frittg\u00e5ende H\u00f8ner Str L, 12 stk","price":43},{"name":"Monster Ultra Watermelon 0,5 l","price":23}] \ No newline at end of file diff --git a/data/products.sample.txt b/data/products.sample.txt new file mode 100644 index 0000000..9a23f24 --- /dev/null +++ b/data/products.sample.txt @@ -0,0 +1,4 @@ +https://oda.com/no/products/27923-tulip-ternet-bacon-2x125g/ +https://oda.com/no/products/23492-ringnes-hamar-julebrus-6-x-033l/ +https://oda.com/no/products/28870-prior-frokostegg-fra-frittgaende-honer-str-l/ +https://oda.com/no/products/40760-monster-monster-ultra-watermelon/ \ No newline at end of file