load(); }

/**
 * Scraper for oda.com.
 *
 * Offers two independent modes:
 * - fetchPopular(): pages through the "popular products" JSON API and dumps
 *   every result to a timestamped JSON file under `data/`.
 * - scrapeFromFile(): reads product page URLs from a file, scrapes each HTML
 *   page and writes the result as JSON or CSV.
 */
class Oda
{
    /**
     * Cookie jar passed along with every request this class sends.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request (for cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers
     *
     * API requires additional headers:
     * - Accept: application/json (probably)
     * - Cookie
     * - X-CSRFToken
     *
     * @var array<string, string>
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests; null until initialize() has found one.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP Client
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Creates the cookie jar and HTTP client.
     *
     * A non-empty `USER_AGENT` environment variable overrides the default
     * User-Agent header.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Request options used when a caller does not supply its own:
     * the base headers plus the shared cookie jar.
     *
     * @return array
     */
    private function defaultOptions(): array
    {
        return [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];
    }

    /**
     * Does the initial request to get the CSRF token and cookies.
     *
     * The token stays null when the response sets no `csrftoken` cookie.
     *
     * @return void
     */
    public function initialize(): void
    {
        $this->client->request('GET', $this->initUrl, $this->defaultOptions());

        // getCookieByName() yields null for an unknown cookie, so the lookup
        // has to be checked before calling getValue() on it.
        $cookie = $this->cookies->getCookieByName('csrftoken');
        $this->csrfToken = $cookie !== null ? $cookie->getValue() : null;
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * @param string $url
     * @param string $method  HTTP method (GET, POST, PUT etc.)
     * @param array  $options Guzzle options; when empty, the base headers and
     *                        the shared cookie jar are used.
     *
     * @return string The response body.
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []): string
    {
        if (empty($options)) {
            $options = $this->defaultOptions();
        }

        $response = $this->client->request($method, $url, $options);

        return (string) $response->getBody();
    }

    /**
     * Helper function for making products API requests.
     *
     * @param array $excludeIds Product IDs sent as `exclude_ids` in the JSON body.
     *
     * @return array The decoded JSON response.
     *
     * @throws \RuntimeException When the response body is not a JSON object/array.
     */
    public function getPopularProducts(array $excludeIds = []): array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';
        if ($this->csrfToken !== null) {
            // Only send the header once a token has actually been obtained.
            $headers['X-CSRFToken'] = $this->csrfToken;
        }

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                'exclude_ids' => $excludeIds,
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);

        try {
            $decoded = json_decode($body, true, 512, JSON_THROW_ON_ERROR);
        } catch (\JsonException $e) {
            throw new \RuntimeException('Products API returned invalid JSON', 0, $e);
        }

        // Valid JSON can still be a scalar or null, which would break the
        // declared array return type.
        if (!is_array($decoded)) {
            throw new \RuntimeException('Products API returned an unexpected (non-array) JSON payload');
        }

        return $decoded;
    }

    /**
     * Runs the scraper to scrape popular products
     *
     * This relies on the Oda API, unlike the scraping.
     *
     * Requests page after page, excluding every product ID already seen,
     * until the API stops reporting `has_more`. The collected products are
     * written to `data/<unix timestamp>.json` next to this file.
     *
     * @return void
     *
     * @throws \RuntimeException When the output directory or file cannot be written.
     * @throws \JsonException    When the products cannot be encoded as JSON.
     */
    public function fetchPopular(): void
    {
        $this->initialize();

        $products = [];
        $excludeIds = [];
        $isFirstPage = true;

        do {
            $page = $this->getPopularProducts($excludeIds);
            $results = $page['results'] ?? [];

            $products = array_merge($products, $results);
            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));

            if ($isFirstPage) {
                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
                $isFirstPage = false;
            } else {
                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
            }

            // An empty page adds nothing to $excludeIds, so the next request
            // would be identical; stop instead of looping forever.
            $hasMore = !empty($page['has_more']) && !empty($results);
        } while ($hasMore);

        $directory = __DIR__ . '/data';
        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
            throw new \RuntimeException(sprintf('Unable to create output directory `%s`', $directory));
        }

        $filename = sprintf('%s/%s.json', $directory, time());
        if (file_put_contents($filename, json_encode($products, JSON_THROW_ON_ERROR)) === false) {
            throw new \RuntimeException(sprintf('Unable to write products to `%s`', $filename));
        }
    }

    /**
     * Takes a Oda product page URL and scrapes the product data.
     *
     * @param string $url
     *
     * @return array Keys: `name` (string, text of the `h1`), `price` (float,
     *               `content` attribute of `.price`) and `unit_type` (list of
     *               strings: the `.unit-price` text split on " per ").
     */
    public function scrapeProductPage(string $url): array
    {
        $crawler = new Crawler($this->getRequestBody($url));

        $name = trim((string) $crawler->filter('h1')->text());
        $price = (float) $crawler->filter('.price')->attr('content');
        $unitPriceText = trim((string) $crawler->filter('.unit-price')->text());

        return [
            'name' => $name,
            'price' => $price,
            'unit_type' => $this->parseUnitPrice($unitPriceText),
        ];
    }

    /**
     * Cleans up a unit price string such as `kr 50,50 per unit`.
     *
     * Steps: drop the leading `kr` currency marker, switch the decimal comma
     * to a dot, then split on " per ".
     *
     * @param string $text Raw unit price text.
     *
     * @return array For `kr 50,50 per unit`: `['50.50', 'unit']`. A text
     *               without " per " comes back as a single-element list.
     */
    private function parseUnitPrice(string $text): array
    {
        // Anchored prefix removal. trim($text, 'kr ') would treat "kr " as a
        // set of characters and also eat a trailing "k"/"r" ("liter" -> "lite").
        $withoutCurrency = preg_replace('/^kr[\s\x{00A0}]*/u', '', $text) ?? $text;
        $withDecimalDot = str_replace(',', '.', trim($withoutCurrency));

        return explode(' per ', $withDecimalDot);
    }

    /**
     * Writes the scraped data to the file named by `PRODUCT_OUTPUT_FILE`,
     * as JSON or CSV depending on `PRODUCT_OUTPUT_FORMAT`.
     *
     * Does nothing for an empty product list. Prints a message and returns
     * when either variable is missing or the format is not `json`/`csv`.
     *
     * @param array $products
     *
     * @return void
     *
     * @throws \RuntimeException When the output file cannot be opened or written.
     * @throws \JsonException    When the products cannot be encoded as JSON.
     */
    private function writeProductsToFile(array $products): void
    {
        if (empty($products)) {
            return;
        }

        $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'] ?? '';
        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';

        if (empty($outputFormat) || empty($outputFile)) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        if ($outputFormat === 'json') {
            if (file_put_contents($outputFile, json_encode($products, JSON_THROW_ON_ERROR)) === false) {
                throw new \RuntimeException(sprintf('Unable to write products to `%s`', $outputFile));
            }
            return;
        }

        if ($outputFormat === 'csv') {
            $handle = fopen($outputFile, 'w');
            if ($handle === false) {
                throw new \RuntimeException(sprintf('Unable to open `%s` for writing', $outputFile));
            }

            try {
                // Add the first row as headers. Assuming all products are formatted the same, of course.
                fputcsv($handle, array_keys($products[0]));

                foreach ($products as $product) {
                    fputcsv($handle, $this->flattenCsvRow($product));
                }
            } finally {
                fclose($handle);
            }
            return;
        }

        echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
    }

    /**
     * Makes a product row safe for fputcsv() by JSON-encoding nested arrays.
     *
     * fputcsv() only handles scalar cells; an array value (such as
     * `unit_type`) would otherwise be written as the literal text "Array".
     *
     * @param array $row
     *
     * @return array The row with the same keys and every array value encoded.
     */
    private function flattenCsvRow(array $row): array
    {
        return array_map(
            static fn ($value) => is_array($value) ? json_encode($value, JSON_UNESCAPED_UNICODE) : $value,
            $row
        );
    }

    /**
     * Scrapes products from the specified `PRODUCT_URL_LIST` file.
     * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future.
     *
     * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format.
     *
     * The list is read one URL per line; blank lines are skipped.
     *
     * @return array
     *
     * @throws \RuntimeException When the URL list cannot be opened.
     */
    public function scrapeFromFile(): array
    {
        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';
        if (empty($filename)) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $inputHandle = fopen($filename, 'r');
        if ($inputHandle === false) {
            throw new \RuntimeException(sprintf('Unable to open product URL list `%s`', $filename));
        }

        $products = [];
        try {
            while (($line = fgets($inputHandle)) !== false) {
                $url = trim($line);
                if ($url === '') {
                    // Blank line (e.g. trailing newline at end of file).
                    continue;
                }

                $products[] = $this->scrapeProductPage($url);
            }
        } finally {
            fclose($inputHandle);
        }

        $this->writeProductsToFile($products);

        return $products;
    }
}

$oda = new Oda();
$oda->scrapeFromFile();