296 lines
7.7 KiB
PHP
296 lines
7.7 KiB
PHP
<?php
|
|
namespace Decicus\Scraper;
|
|
require __DIR__ . '/vendor/autoload.php';
|
|
|
|
use Symfony\Component\DomCrawler\Crawler;
|
|
use GuzzleHttp\Cookie\CookieJar;
|
|
use GuzzleHttp\Client as HttpClient;
|
|
use GuzzleHttp\Psr7\Request;
|
|
use GuzzleHttp\RequestOptions;
|
|
|
|
if (file_exists(__DIR__ . '/.env')) {
    /**
     * A `.env` file sits next to this script: load it through Dotenv.
     * The scraper class in this file reads its settings from `$_ENV`.
     */
    $env = \Dotenv\Dotenv::createImmutable(__DIR__, '.env');
    $env->load();
}
|
|
|
|
class Oda
{
    /**
     * Cookie jar handed to every request, so cookies set by the initial
     * request are sent along with the later API requests.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request (for cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers
     *
     * API requires additional headers:
     * - Accept: application/json (probably)
     * - Cookie
     * - X-CSRFToken
     *
     * @var array<string, string>
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests.
     * Stays null until `initialize()` has found a `csrftoken` cookie.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP Client
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Sets up the cookie jar and HTTP client.
     * A non-empty `USER_AGENT` entry in `$_ENV` replaces the default User-Agent header.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Does the initial request to get the CSRF token and cookies.
     *
     * The token is left as null when the response did not set a `csrftoken` cookie.
     *
     * @return void
     */
    public function initialize() : void
    {
        $options = [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];

        $this->client->request('GET', $this->initUrl, $options);

        // The jar lookup yields null for a missing cookie, so check before reading the value.
        $cookie = $this->cookies->getCookieByName('csrftoken');
        $this->csrfToken = $cookie !== null ? $cookie->getValue() : null;
    }

    /**
     * Reads an optional setting from `$_ENV`.
     *
     * @param string $name Name of the environment variable.
     *
     * @return string|null The value, or null when the variable is unset or empty.
     */
    private function env(string $name) : ?string
    {
        $value = $_ENV[$name] ?? null;

        return empty($value) ? null : (string) $value;
    }

    /**
     * Encodes data as JSON and writes it to a file, reporting failures on stdout.
     *
     * @param string $filename Destination path.
     * @param array $data Data to encode.
     *
     * @return bool True when the file was written.
     */
    private function writeJsonFile(string $filename, array $data) : bool
    {
        $json = json_encode($data);

        if ($json === false) {
            echo sprintf('Unable to encode data as JSON: %s%s', json_last_error_msg(), PHP_EOL);
            return false;
        }

        if (file_put_contents($filename, $json) === false) {
            echo sprintf('Unable to write file: `%s`%s', $filename, PHP_EOL);
            return false;
        }

        return true;
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * When no options are given, the base headers and the shared cookie jar are used.
     *
     * @param string $url
     * @param string $method HTTP method (GET, POST, PUT etc.)
     * @param array $options Guzzle options
     *
     * @return string Response body
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string
    {
        if (empty($options)) {
            $options = [
                RequestOptions::HEADERS => $this->headers,
                RequestOptions::COOKIES => $this->cookies,
            ];
        }

        $response = $this->client->request($method, $url, $options);

        return (string) $response->getBody();
    }

    /**
     * Helper function for making products API requests.
     *
     * Sends a JSON POST with the IDs to exclude and returns the decoded response.
     *
     * @param array $excludeIds Product IDs sent as `exclude_ids`.
     *
     * @return array Decoded JSON response.
     *
     * @throws \RuntimeException When the response body does not decode to an array.
     */
    public function getPopularProducts($excludeIds = []) : array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';

        // Without a token there is nothing meaningful to send in this header.
        if ($this->csrfToken !== null) {
            $headers['X-CSRFToken'] = $this->csrfToken;
        }

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                'exclude_ids' => $excludeIds,
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);
        $decoded = json_decode($body, true);

        // json_decode() gives null on invalid JSON and scalars for scalar documents;
        // neither satisfies the declared array return type.
        if (!is_array($decoded)) {
            throw new \RuntimeException(
                sprintf('Unexpected response from the products API (%s)', json_last_error_msg())
            );
        }

        return $decoded;
    }

    /**
     * Runs the scraper to scrape popular products
     *
     * This relies on the Oda API, unlike the scraping.
     *
     * Requests pages until the API stops reporting `has_more`, passing the IDs
     * collected so far as exclusions, then writes everything to
     * `data/<unix timestamp>.json` next to this script.
     *
     * @return void
     */
    public function fetchPopular() : void
    {
        $this->initialize();

        $excludeIds = [];
        $products = [];
        $isFirstPage = true;

        do {
            $page = $this->getPopularProducts($excludeIds);
            $results = $page['results'] ?? [];

            $products = array_merge($products, $results);
            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));

            if ($isFirstPage) {
                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
                $isFirstPage = false;
            } else {
                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
            }

            // An empty page would repeat the exact same request, so stop even if `has_more` is still set.
            $hasMore = !empty($page['has_more']) && !empty($results);
        } while ($hasMore);

        $directory = __DIR__ . '/data';

        if (!is_dir($directory) && !mkdir($directory, 0775, true) && !is_dir($directory)) {
            echo sprintf('Unable to create output directory: `%s`%s', $directory, PHP_EOL);
            return;
        }

        $filename = sprintf('%s/%s.json', $directory, time());
        $this->writeJsonFile($filename, $products);
    }

    /**
     * Takes an Oda product page URL and scrapes the product data.
     *
     * The name is the trimmed text of the `h1` element; the price is the
     * `content` attribute of the `.price` element cast to float.
     *
     * NOTE(review): presumably the crawler raises an exception when a selector
     * matches nothing - confirm against the DomCrawler documentation.
     *
     * @param string $url
     *
     * @return array Array with `name` (string) and `price` (float).
     */
    public function scrapeProductPage(string $url) : array
    {
        $body = $this->getRequestBody($url);
        $crawler = new Crawler($body);

        $name = trim((string) $crawler->filter('h1')->text());
        $price = (float) $crawler->filter('.price')->attr('content');

        return [
            'name' => $name,
            'price' => $price,
        ];
    }

    /**
     * Writes the scraped data to the `PRODUCT_OUTPUT_FILE` file, as JSON or CSV
     * depending on `PRODUCT_OUTPUT_FORMAT`.
     *
     * Nothing is written when there are no products, when either setting is
     * missing, or when the format is neither `json` nor `csv`.
     *
     * @param array $products
     *
     * @return void
     */
    private function writeProductsToFile(array $products) : void
    {
        if (empty($products)) {
            return;
        }

        $outputFormat = $this->env('PRODUCT_OUTPUT_FORMAT');
        $outputFile = $this->env('PRODUCT_OUTPUT_FILE');

        if ($outputFormat === null || $outputFile === null) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        if ($outputFormat === 'json') {
            $this->writeJsonFile($outputFile, $products);
            return;
        }

        if ($outputFormat === 'csv') {
            $handle = fopen($outputFile, 'w');

            if ($handle === false) {
                echo sprintf('Unable to open output file: `%s`%s', $outputFile, PHP_EOL);
                return;
            }

            // Add the first row as headers. Assuming all products are formatted the same, of course.
            $columns = array_keys(reset($products));
            fputcsv($handle, $columns);

            foreach ($products as $product) {
                fputcsv($handle, $product);
            }

            fclose($handle);
            return;
        }

        echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
    }

    /**
     * Scrapes products from the specified `PRODUCT_URL_LIST` file.
     * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future.
     *
     * The list is read line by line, one URL per line; blank lines are skipped.
     *
     * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format.
     *
     * @return array
     */
    public function scrapeFromFile() : array
    {
        $filename = $this->env('PRODUCT_URL_LIST');

        if ($filename === null) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $inputHandle = is_readable($filename) ? fopen($filename, 'r') : false;

        if ($inputHandle === false) {
            echo sprintf('Unable to open product URL list: `%s`%s', $filename, PHP_EOL);
            return [];
        }

        $products = [];

        try {
            while (($line = fgets($inputHandle)) !== false) {
                $url = trim($line);

                // A blank line is not a URL; requesting it would only fail.
                if ($url === '') {
                    continue;
                }

                $products[] = $this->scrapeProductPage($url);
            }
        } finally {
            // Release the handle even when a page request or parse throws.
            fclose($inputHandle);
        }

        $this->writeProductsToFile($products);

        return $products;
    }
}
|
|
|
|
// Script entry point: scrape every product URL listed in the configured file.
$oda = new Oda();
$oda->scrapeFromFile();