Oda/OdaScraper.php

324 lines
8.6 KiB
PHP

<?php
namespace Decicus\Scraper;
require __DIR__ . '/vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
use GuzzleHttp\Cookie\CookieJar;
use GuzzleHttp\Client as HttpClient;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\RequestOptions;
if (file_exists(__DIR__ . '/.env')) {
/**
* Load the variables defined in the `.env` file that sits next to this script.
* The Oda class below reads its configuration from `$_ENV`.
* NOTE(review): whether getenv() also sees these values depends on the installed Dotenv version - confirm.
*/
$env = \Dotenv\Dotenv::createImmutable(__DIR__, '.env');
$env->load();
}
/**
 * Scraper for oda.com.
 *
 * Offers two modes:
 * - fetchPopular() pages through the "popular products" JSON API and dumps
 *   everything to `data/<unix timestamp>.json` next to this file.
 * - scrapeFromFile() reads product page URLs from a text file, scrapes each
 *   HTML page and writes the result as both JSON and CSV.
 *
 * Configuration is read from `$_ENV`:
 * `USER_AGENT`, `PRODUCT_URL_LIST`, `PRODUCT_OUTPUT_FILE` and `CSV_SEPARATOR`.
 */
class Oda
{
    /**
     * Cookie jar handed to every request so cookies persist between calls.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request to (used to obtain cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the popular products API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers sent with every request.
     *
     * API requests add on top of these:
     * - Accept: application/json
     * - X-CSRFToken
     * (cookies are supplied through the cookie jar).
     *
     * @var array<string, string>
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests; null until initialize() has found one.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP client.
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Sets up the cookie jar and HTTP client.
     * A non-empty `USER_AGENT` environment value replaces the default User-Agent header.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Does the initial request to collect cookies and remembers the `csrftoken` cookie value.
     *
     * The token stays null when the response did not set that cookie.
     *
     * @return void
     */
    public function initialize(): void
    {
        $this->client->request('GET', $this->initUrl, $this->defaultOptions());

        // getCookieByName() yields null for an unknown cookie, so the lookup has to be
        // checked before getValue() is called on it.
        $cookie = $this->cookies->getCookieByName('csrftoken');
        $this->csrfToken = $cookie !== null ? $cookie->getValue() : null;
    }

    /**
     * Request options used when the caller does not supply any: base headers plus the cookie jar.
     *
     * @return array
     */
    private function defaultOptions(): array
    {
        return [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * @param string $url
     * @param string $method HTTP method (GET, POST, PUT etc.)
     * @param array $options Guzzle options; when empty, the base headers and cookie jar are used.
     *
     * @return string The response body.
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []): string
    {
        if (empty($options)) {
            $options = $this->defaultOptions();
        }

        $response = $this->client->request($method, $url, $options);

        return $response->getBody()->getContents();
    }

    /**
     * Requests one page of popular products from the API.
     *
     * @param array $excludeIds Product IDs that should be left out of the response.
     *
     * @return array The decoded JSON response.
     *
     * @throws \RuntimeException When the response body does not decode to an array.
     */
    public function getPopularProducts(array $excludeIds = []): array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';
        if ($this->csrfToken !== null) {
            $headers['X-CSRFToken'] = $this->csrfToken;
        }

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                // Re-index so the IDs are always encoded as a JSON list, never as an object.
                'exclude_ids' => array_values($excludeIds),
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);
        $decoded = json_decode($body, true);

        if (!is_array($decoded)) {
            throw new \RuntimeException(sprintf(
                'Products API did not return a JSON object or list (%s).',
                json_last_error_msg()
            ));
        }

        return $decoded;
    }

    /**
     * Fetches all popular products through the API and stores them as JSON.
     *
     * Pages are requested until the API stops reporting `has_more`. Every request
     * excludes the IDs collected so far. The result is written to
     * `data/<unix timestamp>.json` next to this file; the directory is created when missing.
     *
     * @return void
     *
     * @throws \RuntimeException When the output directory or file cannot be written.
     */
    public function fetchPopular(): void
    {
        $this->initialize();

        $products = [];
        $excludeIds = [];
        $page = 0;

        do {
            $page++;
            $response = $this->getPopularProducts($excludeIds);
            $results = $response['results'] ?? [];

            $products = array_merge($products, $results);
            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));

            if ($page === 1) {
                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
            } else {
                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
            }

            // An empty page cannot grow the exclude list, so asking again would repeat
            // the very same request forever; stop even if `has_more` is still set.
            $hasMore = !empty($response['has_more']) && !empty($results);
        } while ($hasMore);

        $directory = __DIR__ . '/data';
        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
            throw new \RuntimeException(sprintf('Unable to create directory "%s".', $directory));
        }

        $filename = sprintf('%s/%s.json', $directory, time());
        self::writeFile($filename, self::encodeJson($products));
    }

    /**
     * Takes an Oda product page URL and scrapes the product data.
     *
     * Reads the `h1` text as name, the `content` attribute of `.price` as price and
     * the `.unit-price` text (example: "kr 187,50 per unit") as unit price + unit.
     *
     * NOTE(review): the crawler presumably throws when one of these selectors matches
     * nothing - confirm against the installed DomCrawler version.
     *
     * @param string $url
     *
     * @return array Keys: `name` (string), `price` (float), `unit_price` (float), `unit` (string).
     */
    public function scrapeProductPage(string $url): array
    {
        $crawler = new Crawler($this->getRequestBody($url));

        $product = [
            'name' => trim((string) $crawler->filter('h1')->text()),
            'price' => (float) $crawler->filter('.price')->attr('content'),
        ];

        $unitPriceText = (string) $crawler->filter('.unit-price')->text();

        return array_merge($product, self::parseUnitPrice($unitPriceText));
    }

    /**
     * Splits a unit price label such as "kr 187,50 per unit" into amount and unit.
     *
     * @param string $text Raw label text.
     *
     * @return array Keys: `unit_price` (float) and `unit` (string, empty when the label has none).
     */
    private static function parseUnitPrice(string $text): array
    {
        // Remove redundant text, leaving "<amount> <unit>".
        $stripped = trim(str_replace(['kr ', ' per'], '', trim($text)));
        $parts = explode(' ', $stripped);

        return [
            // The amount uses a decimal comma; a plain float cast stops at the comma
            // ("187,50" => 187.0), so normalise it to a decimal point first.
            'unit_price' => (float) str_replace(',', '.', $parts[0]),
            'unit' => $parts[1] ?? '',
        ];
    }

    /**
     * Writes the scraped data to a JSON file and a CSV file.
     *
     * Both paths are derived from `PRODUCT_OUTPUT_FILE`: a trailing `.csv` or `.json`
     * is stripped and the matching extension appended, so the two files never share a path.
     * `CSV_SEPARATOR` selects the CSV delimiter (default `;`).
     *
     * @param array $products
     *
     * @return void
     *
     * @throws \RuntimeException When a file cannot be opened or written.
     */
    private function writeProductsToFile(array $products): void
    {
        if (empty($products)) {
            return;
        }

        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';
        if (empty($outputFile)) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        // Only touch the extension at the very end of the path, never a directory name.
        $basePath = preg_replace('/\.(?:csv|json)$/i', '', $outputFile);
        $jsonFile = $basePath . '.json';
        $csvFile = $basePath . '.csv';

        // Output JSON
        self::writeFile($jsonFile, self::encodeJson($products, JSON_PRETTY_PRINT));

        // Output CSV
        $csvSeparator = $_ENV['CSV_SEPARATOR'] ?? '';
        if ($csvSeparator === '') {
            $csvSeparator = ';';
        }

        $handle = fopen($csvFile, 'w');
        if ($handle === false) {
            throw new \RuntimeException(sprintf('Unable to open "%s" for writing.', $csvFile));
        }

        try {
            $rows = array_values($products);

            // The first row holds the column names. Assuming all products are formatted the same, of course.
            fputcsv($handle, array_keys($rows[0]), $csvSeparator, '"', '\\');

            foreach ($rows as $product) {
                // Arrays inside a product are flattened to a comma separated string to retain the data.
                $row = array_map(
                    static fn ($value) => is_array($value) ? implode(',', $value) : $value,
                    $product
                );
                fputcsv($handle, $row, $csvSeparator, '"', '\\');
            }
        } finally {
            fclose($handle);
        }
    }

    /**
     * Encodes data as JSON, failing loudly instead of passing `false` on to a file write.
     *
     * @param array $data
     * @param int $flags json_encode() flags.
     *
     * @return string
     *
     * @throws \RuntimeException When encoding fails.
     */
    private static function encodeJson(array $data, int $flags = 0): string
    {
        $json = json_encode($data, $flags);
        if ($json === false) {
            throw new \RuntimeException('Unable to encode products as JSON: ' . json_last_error_msg());
        }

        return $json;
    }

    /**
     * Writes a string to a file and verifies that the write succeeded.
     *
     * @param string $path
     * @param string $contents
     *
     * @return void
     *
     * @throws \RuntimeException When the file cannot be written.
     */
    private static function writeFile(string $path, string $contents): void
    {
        if (file_put_contents($path, $contents) === false) {
            throw new \RuntimeException(sprintf('Unable to write "%s".', $path));
        }
    }

    /**
     * Scrapes every product URL listed in the `PRODUCT_URL_LIST` file (one URL per line).
     *
     * Blank lines are skipped. The scraped products are also written to disk based on
     * `PRODUCT_OUTPUT_FILE` (JSON + CSV).
     *
     * @return array List of products as returned by scrapeProductPage(); empty when no list is configured.
     *
     * @throws \RuntimeException When the URL list cannot be read.
     */
    public function scrapeFromFile(): array
    {
        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';
        if (empty($filename)) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $lines = is_readable($filename) ? file($filename, FILE_IGNORE_NEW_LINES) : false;
        if ($lines === false) {
            throw new \RuntimeException(sprintf('Unable to read product URL list "%s".', $filename));
        }

        $products = [];
        foreach ($lines as $line) {
            $url = trim($line);
            if ($url === '') {
                // A blank line is not a URL; requesting it would only fail.
                continue;
            }

            $products[] = $this->scrapeProductPage($url);
        }

        $this->writeProductsToFile($products);

        return $products;
    }
}
// Script entry point: scrape every URL listed in the `PRODUCT_URL_LIST` file and write the results to disk.
$oda = new Oda();
$oda->scrapeFromFile();