Oda/OdaScraper.php

302 lines
8.2 KiB
PHP
Raw Normal View History

2022-10-29 23:47:05 +02:00
<?php
namespace Decicus\Scraper;
require __DIR__ . '/vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
use GuzzleHttp\Cookie\CookieJar;
use GuzzleHttp\Client as HttpClient;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\RequestOptions;
// Optional local configuration: when a `.env` file sits next to this script,
// load it through Dotenv before the scraper runs. The scraper reads its
// settings (USER_AGENT, PRODUCT_* keys) from `$_ENV`.
$dotenvFile = __DIR__ . '/.env';
if (file_exists($dotenvFile)) {
    $env = \Dotenv\Dotenv::createImmutable(__DIR__, '.env');
    $env->load();
}
/**
 * Scraper for oda.com.
 *
 * Offers two modes:
 *  - `fetchPopular()` pages through the "popular products" JSON API and dumps
 *    the combined result to `data/<timestamp>.json`.
 *  - `scrapeFromFile()` reads product page URLs from a file, scrapes each
 *    page's HTML and writes the result as JSON or CSV.
 *
 * Configuration is read from `$_ENV` (USER_AGENT, PRODUCT_URL_LIST,
 * PRODUCT_OUTPUT_FORMAT, PRODUCT_OUTPUT_FILE).
 */
class Oda
{
    /**
     * CookieJar for Guzzle, passed along with every request.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request (for cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers
     *
     * API requires additional headers:
     * - Accept: application/json (probably)
     * - Cookie
     * - X-CSRFToken
     *
     * @var array<string, string>
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests, or null until `initialize()` has
     * found a `csrftoken` cookie.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP Client
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Sets up the cookie jar and HTTP client. A non-empty `USER_AGENT`
     * environment value replaces the default User-Agent header.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Guzzle options used when a request needs nothing beyond the base
     * headers and the shared cookie jar.
     *
     * @return array
     */
    private function defaultOptions() : array
    {
        return [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];
    }

    /**
     * Does the initial request to get the CSRF token and cookies.
     *
     * If the response did not set a `csrftoken` cookie the token is left as
     * null instead of failing with "call to a member function on null".
     *
     * @return void
     */
    public function initialize() : void
    {
        $this->client->request('GET', $this->initUrl, $this->defaultOptions());

        // getCookieByName() returns null when the cookie is absent; `??` cannot
        // guard a method call on null, so check explicitly.
        $cookie = $this->cookies->getCookieByName('csrftoken');
        $this->csrfToken = $cookie !== null ? $cookie->getValue() : null;
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * @param string $url
     * @param string $method HTTP method (GET, POST, PUT etc.)
     * @param array $options Guzzle options; when empty, the base headers and
     *                       cookie jar are used.
     *
     * @return string The raw response body.
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string
    {
        if (empty($options)) {
            $options = $this->defaultOptions();
        }

        $response = $this->client->request($method, $url, $options);

        return $response->getBody()->getContents();
    }

    /**
     * Helper function for making products API requests.
     *
     * @param array $excludeIds Product IDs sent to the API as `exclude_ids`.
     *
     * @return array The decoded JSON response.
     *
     * @throws \UnexpectedValueException When the response body is not a JSON
     *                                   object/array.
     */
    public function getPopularProducts($excludeIds = []) : array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';
        if ($this->csrfToken !== null) {
            $headers['X-CSRFToken'] = $this->csrfToken;
        }

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                'exclude_ids' => $excludeIds,
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);
        $decoded = json_decode($body, true);

        // Without this check a non-JSON body (e.g. an HTML error page) would
        // surface as an opaque return-type TypeError.
        if (!is_array($decoded)) {
            throw new \UnexpectedValueException(
                sprintf('Products API did not return a JSON document: %s', json_last_error_msg())
            );
        }

        return $decoded;
    }

    /**
     * Runs the scraper to scrape popular products
     *
     * This relies on the Oda API, unlike the scraping.
     *
     * Pages through the API, excluding every product ID already received,
     * until the API reports no more results, then writes everything to
     * `data/<unix timestamp>.json` next to this file.
     *
     * @return void
     */
    public function fetchPopular() : void
    {
        $this->initialize();

        $products = [];
        $excludeIds = [];
        $isFirstPage = true;

        do {
            $page = $this->getPopularProducts($excludeIds);
            $results = $page['results'] ?? [];

            $products = array_merge($products, $results);
            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));

            if ($isFirstPage) {
                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
                $isFirstPage = false;
            } else {
                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
            }

            // An empty page cannot grow the exclude list, so the next request
            // would be identical; stop rather than loop forever.
            $hasMore = !empty($page['has_more']) && !empty($results);
        } while ($hasMore);

        $directory = __DIR__ . '/data';
        if (!is_dir($directory) && !mkdir($directory, 0777, true) && !is_dir($directory)) {
            echo sprintf('Could not create output directory `%s`, skipping...', $directory) . PHP_EOL;
            return;
        }

        $filename = sprintf('%s/%s.json', $directory, time());
        if (file_put_contents($filename, json_encode($products)) === false) {
            echo sprintf('Could not write `%s`', $filename) . PHP_EOL;
        }
    }

    /**
     * Takes a Oda product page URL and scrapes the product data.
     *
     * @param string $url
     *
     * @return array With keys `name` (string, text of the `h1`), `price`
     *               (float, `content` attribute of `.price`) and `unit_type`
     *               (list of strings, see `parseUnitPrice()`).
     */
    public function scrapeProductPage(string $url) : array
    {
        $crawler = new Crawler($this->getRequestBody($url));

        $name = trim((string) $crawler->filter('h1')->text());
        $price = (float) $crawler->filter('.price')->attr('content');
        $unitPriceText = trim((string) $crawler->filter('.unit-price')->text());

        return [
            'name' => $name,
            'price' => $price,
            'unit_type' => $this->parseUnitPrice($unitPriceText),
        ];
    }

    /**
     * Splits a unit price label such as `kr 50,50 per unit` into its parts.
     *
     * Quick-and-dirty clean-up of the label: drop the leading `kr`, switch
     * the decimal comma to a dot, then split on ` per `.
     *
     * @param string $text Raw label text, e.g. `kr 50,50 per unit`.
     *
     * @return array List of strings, e.g. `['50.50', 'unit']`.
     */
    private function parseUnitPrice(string $text) : array
    {
        $value = trim($text);

        // Only strip a leading "kr". trim($value, 'kr ') treats its second
        // argument as a character set and strips both ends, which turned
        // units like "liter" into "lite" and "stk" into "st".
        if (strncmp($value, 'kr', 2) === 0) {
            $value = ltrim(substr($value, 2));
        }

        $value = str_replace(',', '.', $value); // 50.50 per unit

        return explode(' per ', $value); // ['50.50', 'unit']
    }

    /**
     * Writes the scraped data to the file named by `PRODUCT_OUTPUT_FILE`, as
     * JSON or CSV depending on `PRODUCT_OUTPUT_FORMAT`.
     *
     * Does nothing for an empty product list. Problems (missing settings,
     * unknown format, unwritable file) are reported on stdout and the write
     * is skipped.
     *
     * @param array $products
     *
     * @return void
     */
    private function writeProductsToFile(array $products) : void
    {
        if (empty($products)) {
            return;
        }

        // `??` avoids an undefined-index warning when the keys are not set.
        $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'] ?? '';
        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';

        if (empty($outputFormat) || empty($outputFile)) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        if ($outputFormat === 'json') {
            if (file_put_contents($outputFile, json_encode($products)) === false) {
                echo sprintf('Could not write `%s`', $outputFile) . PHP_EOL;
            }
            return;
        }

        if ($outputFormat === 'csv') {
            $handle = fopen($outputFile, 'w');
            if ($handle === false) {
                echo sprintf('Could not open `%s` for writing, skipping...', $outputFile) . PHP_EOL;
                return;
            }

            // Add the first row as headers. Assuming all products are formatted the same, of course.
            $columns = array_keys($products[0]);
            fputcsv($handle, $columns);

            foreach ($products as $product) {
                // A CSV cell must be scalar; nested values such as `unit_type`
                // are stored as JSON instead of the literal string "Array".
                $row = array_map(
                    static function ($value) {
                        return is_array($value) ? json_encode($value) : $value;
                    },
                    $product
                );
                fputcsv($handle, $row);
            }

            fclose($handle);
            return;
        }

        echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
    }

    /**
     * Scrapes products from the specified `PRODUCT_URL_LIST` file.
     * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future.
     *
     * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format.
     *
     * The list file holds one URL per line; blank lines are ignored.
     *
     * @return array
     */
    public function scrapeFromFile() : array
    {
        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';
        if (empty($filename)) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $inputHandle = fopen($filename, 'r');
        if ($inputHandle === false) {
            echo sprintf('Could not open product URL list `%s`, skipping...', $filename) . PHP_EOL;
            return [];
        }

        $products = [];
        try {
            while (($line = fgets($inputHandle)) !== false) {
                $url = trim($line);
                if ($url === '') {
                    // Blank or trailing line: nothing to request.
                    continue;
                }

                $products[] = $this->scrapeProductPage($url);
            }
        } finally {
            // Release the handle even when a page fails to scrape.
            fclose($inputHandle);
        }

        $this->writeProductsToFile($products);

        return $products;
    }
}
// Script entry point: scrape every product URL listed in the configured
// input file and write the configured output file.
$scraper = new Oda();
$scraper->scrapeFromFile();