load(); }

/**
 * Scraper for oda.com product data.
 *
 * Two entry points are provided:
 * - fetchPopular(): pages through the "popular products" JSON API and dumps the result to a JSON file.
 * - scrapeFromFile(): scrapes the HTML product pages listed in a file and writes JSON + CSV output.
 */
class Oda
{
    /**
     * CookieJar for Guzzle, passed explicitly with every request.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request (for cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers
     *
     * API requires additional headers:
     * - Accept: application/json (probably)
     * - Cookie
     * - X-CSRFToken
     *
     * @var array
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests.
     * Stays null until initialize() has run and found a `csrftoken` cookie.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP Client
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Sets up the cookie jar and HTTP client.
     * A non-empty `USER_AGENT` environment value replaces the default User-Agent header.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Request options used when the caller does not supply any: base headers plus the cookie jar.
     *
     * @return array
     */
    private function defaultOptions() : array
    {
        return [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];
    }

    /**
     * Does the initial request to get the CSRF token and cookies.
     *
     * The token is left as null when the response did not set a `csrftoken` cookie.
     */
    public function initialize() : void
    {
        $this->client->request('GET', $this->initUrl, $this->defaultOptions());

        // The lookup yields null when the cookie is absent, so it must be checked
        // before calling getValue(); `??` cannot guard a method call on null.
        $cookie = $this->cookies->getCookieByName('csrftoken');
        $this->csrfToken = $cookie !== null ? $cookie->getValue() : null;
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * @param string $url
     * @param string $method  HTTP method (GET, POST, PUT etc.)
     * @param array  $options Guzzle options; when empty, the base headers and cookie jar are used.
     *
     * @return string The response body.
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string
    {
        if (empty($options)) {
            $options = $this->defaultOptions();
        }

        return $this->client->request($method, $url, $options)->getBody()->getContents();
    }

    /**
     * Helper function for making products API requests.
     *
     * The CSRF token header is taken from the value stored by initialize(),
     * so initialize() should be called first.
     *
     * @param array $excludeIds Product IDs sent as `exclude_ids` in the JSON request body.
     *
     * @return array The decoded JSON response.
     *
     * @throws \JsonException    When the response body is not valid JSON.
     * @throws \RuntimeException When the response decodes to something other than an array.
     */
    public function getPopularProducts($excludeIds = []) : array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';
        $headers['X-CSRFToken'] = $this->csrfToken;

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                'exclude_ids' => $excludeIds,
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);

        // Fail with a descriptive error instead of an opaque return-type TypeError
        // when the body is not JSON or not a JSON object/array.
        $decoded = json_decode($body, true, 512, JSON_THROW_ON_ERROR);
        if (!is_array($decoded)) {
            throw new \RuntimeException('Unexpected response from the products API: expected a JSON object.');
        }

        return $decoded;
    }

    /**
     * Runs the scraper to scrape popular products
     *
     * This relies on the Oda API, unlike the scraping.
     * Pages are requested until the API stops reporting `has_more`; every ID already
     * received is sent back as an exclusion on the next request. The collected products
     * are written to `data/<unix timestamp>.json` next to this file.
     *
     * @return void
     *
     * @throws \RuntimeException When the output directory or file cannot be written.
     */
    public function fetchPopular() : void
    {
        $this->initialize();

        $excludeIds = [];
        $products = [];
        $isFirstPage = true;

        do {
            $page = $this->getPopularProducts($excludeIds);
            $results = $page['results'] ?? [];
            $products = array_merge($products, $results);

            if ($isFirstPage) {
                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
                $isFirstPage = false;
            } else {
                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
            }

            $newIds = array_column($results, 'id');
            $excludeIds = array_merge($excludeIds, $newIds);

            // Without new IDs the next request would be identical to this one,
            // so stop rather than loop forever.
            $hasMore = !empty($page['has_more']) && $newIds !== [];
        } while ($hasMore);

        $directory = __DIR__ . '/data';
        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
            throw new \RuntimeException(sprintf('Unable to create output directory "%s".', $directory));
        }

        $filename = sprintf('%s/%s.json', $directory, time());
        if (file_put_contents($filename, json_encode($products)) === false) {
            throw new \RuntimeException(sprintf('Unable to write "%s".', $filename));
        }
    }

    /**
     * Takes a Oda product page URL and scrapes the product data.
     *
     * @param string $url
     *
     * @return array Keys: `name` (string), `price` (float), `unit_price` (float),
     *               `unit` (string, or null when the unit-price text has no second word).
     */
    public function scrapeProductPage(string $url) : array
    {
        $body = $this->getRequestBody($url);
        $crawler = new Crawler($body);

        $name = trim((string) $crawler->filter('h1')->text());
        $price = (float) $crawler->filter('.price')->attr('content');

        /**
         * Unit price handler.
         * Example: "kr 187,50 per unit"
         */
        $priceUnitText = trim((string) $crawler->filter('.unit-price')->text());

        // Remove redundant text
        $priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText);

        /**
         * Split the price and unit.
         *
         * For the example above the result is:
         * - [0] => "187,50"
         * - [1] => "unit"
         */
        $priceUnit = explode(' ', trim($priceUnitText));

        // A float cast stops at the comma ("187,50" => 187.0), so switch to a decimal point first.
        $unitPrice = (float) str_replace(',', '.', $priceUnit[0]);

        return [
            'name' => $name,
            'price' => $price,
            'unit_price' => $unitPrice,
            'unit' => $priceUnit[1] ?? null,
        ];
    }

    /**
     * Writes the scraped data to a CSV & a JSON file.
     *
     * Both paths are derived from the `PRODUCT_OUTPUT_FILE` environment value: a trailing
     * `.csv` / `.json` extension is dropped and each extension is appended in turn, so the
     * two files can never end up at the same path. The CSV separator comes from
     * `CSV_SEPARATOR` and defaults to `;`.
     *
     * @param array $products
     *
     * @return void
     *
     * @throws \RuntimeException When the CSV file cannot be opened for writing.
     */
    private function writeProductsToFile(array $products) : void
    {
        if (empty($products)) {
            return;
        }

        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';
        if (empty($outputFile)) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        $basePath = preg_replace('/\.(csv|json)$/i', '', $outputFile);
        $jsonFile = $basePath . '.json';
        $csvFile = $basePath . '.csv';

        // Output JSON
        file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT));

        $handle = fopen($csvFile, 'w');
        if ($handle === false) {
            throw new \RuntimeException(sprintf('Unable to open "%s" for writing.', $csvFile));
        }

        try {
            $csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';';

            // Add the first row as headers. Assuming all products are formatted the same, of course.
            $firstProduct = reset($products);
            fputcsv($handle, array_keys($firstProduct), $csvSeparator);

            foreach ($products as $product) {
                /**
                 * Special handler for arrays inside the product array.
                 * To retain the data, we 'convert' the array to a string.
                 */
                foreach ($product as $subIdx => $subItem) {
                    if (is_array($subItem)) {
                        $product[$subIdx] = implode(',', $subItem);
                    }
                }

                fputcsv($handle, $product, $csvSeparator);
            }
        } finally {
            fclose($handle);
        }
    }

    /**
     * Scrapes products from the specified `PRODUCT_URL_LIST` file (one URL per line, blank lines ignored).
     * Returns the scraped products as an array of the entries produced by scrapeProductPage().
     *
     * Will also write the results as JSON and CSV based on `PRODUCT_OUTPUT_FILE`.
     *
     * @return array
     *
     * @throws \RuntimeException When the URL list cannot be opened.
     */
    public function scrapeFromFile() : array
    {
        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';
        if (empty($filename)) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $inputHandle = fopen($filename, 'r');
        if ($inputHandle === false) {
            throw new \RuntimeException(sprintf('Unable to open product URL list "%s".', $filename));
        }

        $products = [];
        try {
            while (($line = fgets($inputHandle)) !== false) {
                $url = trim($line);
                if ($url === '') {
                    // A blank line is not a URL; requesting it would only fail.
                    continue;
                }

                $products[] = $this->scrapeProductPage($url);
            }
        } finally {
            fclose($inputHandle);
        }

        $this->writeProductsToFile($products);

        return $products;
    }
}

$oda = new Oda();
$oda->scrapeFromFile();