Initial commit

This commit is contained in:
Alex Thomassen 2022-10-29 21:47:05 +00:00
commit 14b5cc2383
Signed by untrusted user: Alex
GPG Key ID: 10BD786B5F6FF5DE
11 changed files with 1664 additions and 0 deletions

14
.env.example Normal file
View File

@ -0,0 +1,14 @@
# Relative paths are relative to the current working directory (`pwd`), so explicit full paths are recommended.
# See *.sample.* files for test runs and examples.
# Simply a list of Oda.com product URLs to scrape. One URL per line.
PRODUCT_URL_LIST=./data/products.txt
# The output file for the scraped data. This file will be overwritten if it already exists.
PRODUCT_OUTPUT_FILE=./data/products.csv
# Can be either 'json' or 'csv'. Keep this consistent with the file extension used in PRODUCT_OUTPUT_FILE.
PRODUCT_OUTPUT_FORMAT=json
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"

2
.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
vendor
.env

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2022 Alex Thomassen <alex@cocks.no>
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

296
OdaScraper.php Normal file
View File

@ -0,0 +1,296 @@
<?php
namespace Decicus\Scraper;
require __DIR__ . '/vendor/autoload.php';
use Symfony\Component\DomCrawler\Crawler;
use GuzzleHttp\Cookie\CookieJar;
use GuzzleHttp\Client as HttpClient;
use GuzzleHttp\Psr7\Request;
use GuzzleHttp\RequestOptions;
if (file_exists(__DIR__ . '/.env')) {
/**
* Load the variables defined in the local `.env` file through Dotenv, so the
* scraper below can read its settings (it looks them up in `$_ENV`).
* The file is optional: when it is missing, nothing is loaded here.
*/
$env = \Dotenv\Dotenv::createImmutable(__DIR__, '.env');
$env->load();
}
/**
 * Scraper for Oda.com.
 *
 * Supports two modes:
 * - `fetchPopular()`: pages through the "popular products" API and dumps the raw results as JSON.
 * - `scrapeFromFile()`: scrapes name + price from every product page URL listed in `PRODUCT_URL_LIST`
 *   and writes them to `PRODUCT_OUTPUT_FILE` in the `PRODUCT_OUTPUT_FORMAT` format.
 *
 * Configuration is read from `$_ENV` (`USER_AGENT`, `PRODUCT_URL_LIST`, `PRODUCT_OUTPUT_FILE`,
 * `PRODUCT_OUTPUT_FORMAT`).
 */
class Oda
{
    /**
     * CookieJar for Guzzle, shared between all requests.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request (for cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers
     *
     * API requires additional headers:
     * - Accept: application/json (probably)
     * - Cookie
     * - X-CSRFToken
     *
     * @var array<string, string>
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests. Stays `null` until `initialize()` has found one.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP Client
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Sets up the cookie jar and HTTP client, and applies the `USER_AGENT` override when configured.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Does the initial request to get the CSRF token and cookies.
     *
     * If the response did not set a `csrftoken` cookie, the token is left as `null`
     * instead of failing on a missing cookie.
     */
    public function initialize() : void
    {
        $options = [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];

        $this->client->request('GET', $this->initUrl, $options);

        // getCookieByName() returns null when the cookie is absent, hence the nullsafe call.
        $this->csrfToken = $this->cookies->getCookieByName('csrftoken')?->getValue();
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * @param string $url
     * @param string $method HTTP method (GET, POST, PUT etc.)
     * @param array $options Guzzle options. When empty, the base headers and the shared cookie jar are used.
     *
     * @return string The raw response body.
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string
    {
        if (empty($options)) {
            $options = [
                RequestOptions::HEADERS => $this->headers,
                RequestOptions::COOKIES => $this->cookies,
            ];
        }

        $response = $this->client->request($method, $url, $options);

        return $response->getBody()->getContents();
    }

    /**
     * Helper function for making products API requests.
     *
     * @param array $excludeIds Product IDs that the API should leave out of the response.
     *
     * @return array The decoded JSON response.
     *
     * @throws \JsonException When the response body is not valid JSON.
     * @throws \RuntimeException When the response decodes to something other than an array.
     */
    public function getPopularProducts(array $excludeIds = []) : array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';
        $headers['X-CSRFToken'] = $this->csrfToken;

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                'exclude_ids' => $excludeIds,
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);
        $decoded = json_decode($body, true, 512, JSON_THROW_ON_ERROR);

        // A scalar/null payload would otherwise violate the declared return type.
        if (!is_array($decoded)) {
            throw new \RuntimeException('Unexpected response from the products API: expected a JSON object or array.');
        }

        return $decoded;
    }

    /**
     * Runs the scraper to scrape popular products
     *
     * This relies on the Oda API, unlike the product page scraping.
     * Every page of results is requested with the IDs collected so far as `exclude_ids`,
     * until the API reports that there is nothing more. The combined results are written
     * to `data/<unix timestamp>.json` next to this file.
     *
     * @return void
     *
     * @throws \RuntimeException When the output file cannot be written.
     */
    public function fetchPopular() : void
    {
        $this->initialize();

        $excludeIds = [];
        $products = [];
        $isFirstPage = true;

        do {
            $page = $this->getPopularProducts($excludeIds);
            $results = $page['results'] ?? [];

            $products = array_merge($products, $results);
            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));

            if ($isFirstPage) {
                echo sprintf('Scraped %d products', count($products)) . PHP_EOL;
                $isFirstPage = false;
            } else {
                echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;
            }

            // An empty page cannot extend the exclude list, so continuing would repeat the same request forever.
            $hasMore = !empty($page['has_more']) && !empty($results);
        } while ($hasMore);

        $filename = sprintf('%s/data/%s.json', __DIR__, time());
        if (file_put_contents($filename, json_encode($products, JSON_THROW_ON_ERROR)) === false) {
            throw new \RuntimeException(sprintf('Unable to write popular products to `%s`', $filename));
        }
    }

    /**
     * Takes a Oda product page URL and scrapes the product data.
     *
     * The name is taken from the page's `h1` and the price from the `content`
     * attribute of the `.price` element.
     *
     * @param string $url
     *
     * @return array{name: string, price: float}
     */
    public function scrapeProductPage(string $url) : array
    {
        $body = $this->getRequestBody($url);
        $crawler = new Crawler($body);

        $name = trim((string) $crawler->filter('h1')->text());
        $price = (float) $crawler->filter('.price')->attr('content');

        return [
            'name' => $name,
            'price' => $price,
        ];
    }

    /**
     * Writes the scraped data to `PRODUCT_OUTPUT_FILE`, as JSON or CSV depending on `PRODUCT_OUTPUT_FORMAT`.
     *
     * Nothing is written when there are no products, when either setting is missing,
     * or when the format is neither `json` nor `csv` (a message is printed for the latter two cases).
     *
     * @param array $products
     *
     * @return void
     *
     * @throws \RuntimeException When the output file cannot be opened or written.
     */
    private function writeProductsToFile(array $products) : void
    {
        if (empty($products)) {
            return;
        }

        // `??` avoids "undefined array key" warnings when the settings are not configured at all.
        $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'] ?? '';
        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';

        if (empty($outputFormat) || empty($outputFile)) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        if ($outputFormat === 'json') {
            if (file_put_contents($outputFile, json_encode($products)) === false) {
                throw new \RuntimeException(sprintf('Unable to write output file `%s`', $outputFile));
            }

            return;
        }

        if ($outputFormat === 'csv') {
            $handle = fopen($outputFile, 'w');
            if ($handle === false) {
                throw new \RuntimeException(sprintf('Unable to open output file `%s` for writing', $outputFile));
            }

            try {
                // Add the first row as headers. Assuming all products are formatted the same, of course.
                $columns = array_keys($products[array_key_first($products)]);
                fputcsv($handle, $columns);

                foreach ($products as $product) {
                    fputcsv($handle, $product);
                }
            } finally {
                fclose($handle);
            }

            return;
        }

        echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
    }

    /**
     * Scrapes products from the specified `PRODUCT_URL_LIST` file (one URL per line; blank lines are ignored).
     * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future.
     *
     * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format.
     *
     * @return array
     *
     * @throws \RuntimeException When the URL list file cannot be read.
     */
    public function scrapeFromFile() : array
    {
        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';
        if (empty($filename)) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $inputHandle = is_readable($filename) ? fopen($filename, 'r') : false;
        if ($inputHandle === false) {
            throw new \RuntimeException(sprintf('Unable to read product URL list `%s`', $filename));
        }

        $products = [];

        try {
            while (($line = fgets($inputHandle)) !== false) {
                $url = trim($line);

                // Blank lines (e.g. a trailing newline at the end of the file) are not URLs.
                if ($url === '') {
                    continue;
                }

                $products[] = $this->scrapeProductPage($url);
            }
        } finally {
            // Release the handle even when a product page fails to scrape.
            fclose($inputHandle);
        }

        $this->writeProductsToFile($products);

        return $products;
    }
}
// Script entry point: scrape every URL from the configured list and write the output file.
$scraper = new Oda();
$scraper->scrapeFromFile();

16
README.md Normal file
View File

@ -0,0 +1,16 @@
# Scraper for Oda.com
## Requirements
- PHP 8.1 (cli)
- PHP extension: `php-xml`
- PHP extension: `php-curl`
- [`Composer`](https://getcomposer.org/)
- Other PHP extensions may also be necessary to install. Composer will likely complain when you run `composer install`.
## Setup
- Copy `.env.example` to `.env` and fill in the values as you see fit.
- Run `composer install` to install dependencies.
- Run `php OdaScraper.php` to scrape the products.
- Optionally, script something to send the output files to wherever you want them.

8
composer.json Normal file
View File

@ -0,0 +1,8 @@
{
"require": {
"guzzlehttp/guzzle": "^7.5",
"symfony/dom-crawler": "^6.1",
"symfony/css-selector": "^6.1",
"vlucas/phpdotenv": "^5.5"
}
}

1294
composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

3
data/.gitignore vendored Normal file
View File

@ -0,0 +1,3 @@
*
!.gitignore
!*.sample.*

5
data/products.sample.csv Normal file
View File

@ -0,0 +1,5 @@
name,price
"Ternet Bacon 2x125g, 250 g",45.7
"Hamar Julebrus 6 x 0,33l, 1,98 l",128.4
"Frokostegg Fra Frittgående Høner Str L, 12 stk",43
"Monster Ultra Watermelon 0,5 l",23
1 name price
2 Ternet Bacon 2x125g, 250 g 45.7
3 Hamar Julebrus 6 x 0,33l, 1,98 l 128.4
4 Frokostegg Fra Frittgående Høner Str L, 12 stk 43
5 Monster Ultra Watermelon 0,5 l 23

View File

@ -0,0 +1 @@
[{"name":"Ternet Bacon 2x125g, 250 g","price":45.7},{"name":"Hamar Julebrus 6 x 0,33l, 1,98 l","price":128.4},{"name":"Frokostegg Fra Frittg\u00e5ende H\u00f8ner Str L, 12 stk","price":43},{"name":"Monster Ultra Watermelon 0,5 l","price":23}]

4
data/products.sample.txt Normal file
View File

@ -0,0 +1,4 @@
https://oda.com/no/products/27923-tulip-ternet-bacon-2x125g/
https://oda.com/no/products/23492-ringnes-hamar-julebrus-6-x-033l/
https://oda.com/no/products/28870-prior-frokostegg-fra-frittgaende-honer-str-l/
https://oda.com/no/products/40760-monster-monster-ultra-watermelon/