Initial commit
This commit is contained in:
commit
14b5cc2383
14
.env.example
Normal file
14
.env.example
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
# Relative paths are relative to the current working directory (`pwd`), so explicit full paths are recommended.
|
||||||
|
# See *.sample.* files for test runs and examples.
|
||||||
|
|
||||||
|
# Simply a list of Oda.com product URLs to scrape. One URL per line.
|
||||||
|
PRODUCT_URL_LIST=./data/products.txt
|
||||||
|
|
||||||
|
# The output file for the scraped data. This file will be overwritten if it already exists.
|
||||||
|
PRODUCT_OUTPUT_FILE=./data/products.csv
|
||||||
|
|
||||||
|
# Output format: either 'json' or 'csv'. Keep the extension of PRODUCT_OUTPUT_FILE in sync with this value.
|
||||||
|
PRODUCT_OUTPUT_FORMAT=json
|
||||||
|
|
||||||
|
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
|
||||||
|
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
|
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
vendor
|
||||||
|
.env
|
21
LICENSE
Normal file
21
LICENSE
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
MIT License
|
||||||
|
|
||||||
|
Copyright (c) 2022 Alex Thomassen <alex@cocks.no>
|
||||||
|
|
||||||
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
in the Software without restriction, including without limitation the rights
|
||||||
|
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||||
|
copies of the Software, and to permit persons to whom the Software is
|
||||||
|
furnished to do so, subject to the following conditions:
|
||||||
|
|
||||||
|
The above copyright notice and this permission notice shall be included in all
|
||||||
|
copies or substantial portions of the Software.
|
||||||
|
|
||||||
|
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||||
|
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||||
|
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
|
SOFTWARE.
|
296
OdaScraper.php
Normal file
296
OdaScraper.php
Normal file
@ -0,0 +1,296 @@
|
|||||||
|
<?php
|
||||||
|
namespace Decicus\Scraper;
|
||||||
|
require __DIR__ . '/vendor/autoload.php';
|
||||||
|
|
||||||
|
use Symfony\Component\DomCrawler\Crawler;
|
||||||
|
use GuzzleHttp\Cookie\CookieJar;
|
||||||
|
use GuzzleHttp\Client as HttpClient;
|
||||||
|
use GuzzleHttp\Psr7\Request;
|
||||||
|
use GuzzleHttp\RequestOptions;
|
||||||
|
|
||||||
|
if (file_exists(__DIR__ . '/.env')) {
    /**
     * Load the variables from `.env` so they are available through $_ENV.
     *
     * Note: Dotenv::createImmutable() does not register the values with
     * getenv()/putenv(); the rest of this file therefore reads $_ENV.
     * "Immutable" means variables that already exist in the environment
     * are not overwritten by the values in `.env`.
     */
    $env = \Dotenv\Dotenv::createImmutable(__DIR__, '.env');
    $env->load();
}
|
||||||
|
|
||||||
|
/**
 * Scraper for Oda.com.
 *
 * Offers two modes:
 *  - scrapeFromFile(): scrape name + price from a list of product page URLs.
 *  - fetchPopular():   page through the "popular products" API endpoint.
 *
 * Configuration is read from $_ENV (USER_AGENT, PRODUCT_URL_LIST,
 * PRODUCT_OUTPUT_FILE, PRODUCT_OUTPUT_FORMAT).
 */
class Oda
{
    /**
     * Cookie jar passed along with every request, so cookies received by
     * initialize() are sent with the later requests.
     *
     * @var \GuzzleHttp\Cookie\CookieJar
     */
    private CookieJar $cookies;

    /**
     * URL to send the initial request to (for cookies).
     *
     * @var string
     */
    private string $initUrl = 'https://oda.com/no/products/popular/';

    /**
     * URL for querying the API.
     *
     * @var string
     */
    private string $apiProductsUrl = 'https://oda.com/api/v1/products/popular/';

    /**
     * Base HTTP headers
     *
     * API requires additional headers:
     * - Accept: application/json (probably)
     * - Cookie
     * - X-CSRFToken
     *
     * @var array<string, string>
     */
    private array $headers = [
        'Origin' => 'https://oda.com',
        'User-Agent' => 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36',
    ];

    /**
     * CSRF token used for API requests. Null until initialize() has
     * found a `csrftoken` cookie.
     *
     * @var string|null
     */
    private ?string $csrfToken = null;

    /**
     * HTTP Client
     *
     * @var \GuzzleHttp\Client
     */
    private HttpClient $client;

    /**
     * Creates the cookie jar and HTTP client, and overrides the default
     * User-Agent header when a non-empty USER_AGENT is present in $_ENV.
     */
    public function __construct()
    {
        $this->cookies = new CookieJar();
        $this->client = new HttpClient(['cookies' => true]);

        if (!empty($_ENV['USER_AGENT'])) {
            $this->headers['User-Agent'] = $_ENV['USER_AGENT'];
        }
    }

    /**
     * Request options used when a caller does not supply its own:
     * the base headers plus the shared cookie jar.
     *
     * @return array<string, mixed>
     */
    private function defaultOptions() : array
    {
        return [
            RequestOptions::HEADERS => $this->headers,
            RequestOptions::COOKIES => $this->cookies,
        ];
    }

    /**
     * Does the initial request to get the CSRF token and cookies.
     *
     * The token stays null when the response did not set a `csrftoken` cookie.
     *
     * @return void
     */
    public function initialize() : void
    {
        $this->client->request('GET', $this->initUrl, $this->defaultOptions());

        // getCookieByName() yields null for an unknown cookie, so the lookup
        // has to be null-safe instead of calling getValue() unconditionally.
        $this->csrfToken = $this->cookies->getCookieByName('csrftoken')?->getValue();
    }

    /**
     * Helper function for sending requests via the HTTP client.
     *
     * @param string $url
     * @param string $method  HTTP method (GET, POST, PUT etc.)
     * @param array  $options Guzzle options; when empty, the base headers and
     *                        the shared cookie jar are used.
     *
     * @return string The raw response body.
     */
    private function getRequestBody(string $url, string $method = 'GET', array $options = []) : string
    {
        if (empty($options)) {
            $options = $this->defaultOptions();
        }

        $response = $this->client->request($method, $url, $options);

        return $response->getBody()->getContents();
    }

    /**
     * Helper function for making products API requests.
     *
     * @param array $excludeIds Product IDs sent as `exclude_ids` in the JSON request body.
     *
     * @return array The decoded JSON response; fetchPopular() reads its
     *               `results` and `has_more` keys.
     *
     * @throws \UnexpectedValueException When the response body does not decode to an array.
     */
    public function getPopularProducts(array $excludeIds = []) : array
    {
        $headers = $this->headers;
        $headers['Accept'] = 'application/json';

        // Only send the CSRF header once initialize() has stored a token.
        if ($this->csrfToken !== null) {
            $headers['X-CSRFToken'] = $this->csrfToken;
        }

        $options = [
            RequestOptions::HEADERS => $headers,
            RequestOptions::COOKIES => $this->cookies,
            RequestOptions::JSON => [
                'exclude_ids' => $excludeIds,
            ],
        ];

        $body = $this->getRequestBody($this->apiProductsUrl, 'POST', $options);
        $decoded = json_decode($body, true);

        // json_decode() returns null for invalid JSON, which would otherwise
        // surface as an opaque return-type error.
        if (!is_array($decoded)) {
            throw new \UnexpectedValueException(
                sprintf('Unexpected response from `%s`: body is not a JSON object/array', $this->apiProductsUrl)
            );
        }

        return $decoded;
    }

    /**
     * Runs the scraper to fetch popular products.
     *
     * This relies on the Oda API rather than on scraping HTML pages.
     * Every page of results is requested with the IDs collected so far as
     * `exclude_ids`, until the API reports `has_more` as false. The combined
     * result is written to `data/<unix timestamp>.json` next to this file.
     *
     * @return void
     */
    public function fetchPopular() : void
    {
        $this->initialize();

        $response = $this->getPopularProducts();

        $products = $response['results'] ?? [];
        $excludeIds = array_column($products, 'id');

        echo sprintf('Scraped %d products', count($products)) . PHP_EOL;

        while (!empty($response['has_more'])) {
            $response = $this->getPopularProducts($excludeIds);
            $results = $response['results'] ?? [];

            // An empty page would leave the exclude list unchanged, so the
            // next request would be identical: stop instead of looping forever.
            if (empty($results)) {
                break;
            }

            $products = array_merge($products, $results);

            echo sprintf('Scraped %d/%d products', count($results), count($products)) . PHP_EOL;

            $excludeIds = array_merge($excludeIds, array_column($results, 'id'));
        }

        $filename = sprintf('%s/data/%s.json', __DIR__, time());

        if (file_put_contents($filename, json_encode($products)) === false) {
            echo sprintf('Unable to write output file: `%s`%s', $filename, PHP_EOL);
        }
    }

    /**
     * Takes an Oda product page URL and scrapes the product data.
     *
     * The name is the trimmed text of the `h1` element; the price is the
     * `content` attribute of the `.price` element, cast to float.
     *
     * NOTE(review): the crawler presumably throws when the page has no `h1`
     * or `.price` element; this method does not catch that. Confirm against
     * the installed symfony/dom-crawler version.
     *
     * @param string $url
     *
     * @return array{name: string, price: float}
     */
    public function scrapeProductPage(string $url) : array
    {
        $body = $this->getRequestBody($url);
        $crawler = new Crawler($body);

        $name = trim((string) $crawler->filter('h1')->text());
        $price = (float) $crawler->filter('.price')->attr('content');

        return [
            'name' => $name,
            'price' => $price,
        ];
    }

    /**
     * Writes the scraped data to PRODUCT_OUTPUT_FILE as JSON or CSV,
     * depending on PRODUCT_OUTPUT_FORMAT.
     *
     * Does nothing for an empty product list. Problems (missing settings,
     * unknown format, unwritable file) are reported on standard output and
     * the write is skipped.
     *
     * @param array $products List of product rows, each an associative array.
     *
     * @return void
     */
    private function writeProductsToFile(array $products) : void
    {
        if (empty($products)) {
            return;
        }

        // Default to '' so a missing setting reaches the check below
        // instead of triggering an "undefined array key" warning.
        $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'] ?? '';
        $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'] ?? '';

        if (empty($outputFormat) || empty($outputFile)) {
            echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
            return;
        }

        if ($outputFormat === 'json') {
            if (file_put_contents($outputFile, json_encode($products)) === false) {
                echo sprintf('Unable to write output file: `%s`%s', $outputFile, PHP_EOL);
            }

            return;
        }

        if ($outputFormat === 'csv') {
            $handle = fopen($outputFile, 'w');

            if ($handle === false) {
                echo sprintf('Unable to write output file: `%s`%s', $outputFile, PHP_EOL);
                return;
            }

            // Add the first row as headers. Assuming all products are formatted the same, of course.
            $columns = array_keys($products[array_key_first($products)]);
            fputcsv($handle, $columns);

            foreach ($products as $product) {
                fputcsv($handle, $product);
            }

            fclose($handle);
            return;
        }

        echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
    }

    /**
     * Scrapes products from the specified `PRODUCT_URL_LIST` file (one URL per line).
     * Returns the scraped products as an array with name (string) + price (float), with maybe more values in the future.
     *
     * Blank lines in the list are skipped. A missing setting or an unreadable
     * file is reported on standard output and yields an empty array.
     *
     * Will also write the `PRODUCT_OUTPUT_FILE` file with the `PRODUCT_OUTPUT_FORMAT` format.
     *
     * @return array
     */
    public function scrapeFromFile() : array
    {
        $filename = $_ENV['PRODUCT_URL_LIST'] ?? '';

        if (empty($filename)) {
            echo 'No product URL list specified in environment variables or .env, skipping...' . PHP_EOL;
            return [];
        }

        $inputHandle = is_readable($filename) ? fopen($filename, 'r') : false;

        if ($inputHandle === false) {
            echo sprintf('Unable to read product URL list: `%s`%s', $filename, PHP_EOL);
            return [];
        }

        $products = [];

        while (($line = fgets($inputHandle)) !== false) {
            $url = trim($line);

            // Skip blank lines (e.g. a trailing newline at the end of the list).
            if ($url === '') {
                continue;
            }

            $products[] = $this->scrapeProductPage($url);
        }

        fclose($inputHandle);

        $this->writeProductsToFile($products);

        return $products;
    }
}
|
||||||
|
|
||||||
|
// Script entry point: scrape every URL from PRODUCT_URL_LIST and write the
// result to PRODUCT_OUTPUT_FILE.
(new Oda())->scrapeFromFile();
|
16
README.md
Normal file
16
README.md
Normal file
@ -0,0 +1,16 @@
|
|||||||
|
# Scraper for Oda.com
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- PHP 8.1 (cli)
|
||||||
|
- PHP extension: `php-xml`
|
||||||
|
- PHP extension: `php-curl`
|
||||||
|
- [`Composer`](https://getcomposer.org/)
|
||||||
|
- Other PHP extensions may also be necessary to install. Composer will likely complain when you run `composer install`.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
- Copy `.env.example` to `.env` and fill in the values as you see fit.
|
||||||
|
- Run `composer install` to install dependencies.
|
||||||
|
- Run `php OdaScraper.php` to scrape the products.
|
||||||
|
- Optionally, script something to send the output files wherever you want.
|
8
composer.json
Normal file
8
composer.json
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
{
|
||||||
|
"require": {
|
||||||
|
"guzzlehttp/guzzle": "^7.5",
|
||||||
|
"symfony/dom-crawler": "^6.1",
|
||||||
|
"symfony/css-selector": "^6.1",
|
||||||
|
"vlucas/phpdotenv": "^5.5"
|
||||||
|
}
|
||||||
|
}
|
1294
composer.lock
generated
Normal file
1294
composer.lock
generated
Normal file
File diff suppressed because it is too large
Load Diff
3
data/.gitignore
vendored
Normal file
3
data/.gitignore
vendored
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
*
|
||||||
|
!.gitignore
|
||||||
|
!*.sample.*
|
5
data/products.sample.csv
Normal file
5
data/products.sample.csv
Normal file
@ -0,0 +1,5 @@
|
|||||||
|
name,price
|
||||||
|
"Ternet Bacon 2x125g, 250 g",45.7
|
||||||
|
"Hamar Julebrus 6 x 0,33l, 1,98 l",128.4
|
||||||
|
"Frokostegg Fra Frittgående Høner Str L, 12 stk",43
|
||||||
|
"Monster Ultra Watermelon 0,5 l",23
|
|
1
data/products.sample.json
Normal file
1
data/products.sample.json
Normal file
@ -0,0 +1 @@
|
|||||||
|
[{"name":"Ternet Bacon 2x125g, 250 g","price":45.7},{"name":"Hamar Julebrus 6 x 0,33l, 1,98 l","price":128.4},{"name":"Frokostegg Fra Frittg\u00e5ende H\u00f8ner Str L, 12 stk","price":43},{"name":"Monster Ultra Watermelon 0,5 l","price":23}]
|
4
data/products.sample.txt
Normal file
4
data/products.sample.txt
Normal file
@ -0,0 +1,4 @@
|
|||||||
|
https://oda.com/no/products/27923-tulip-ternet-bacon-2x125g/
|
||||||
|
https://oda.com/no/products/23492-ringnes-hamar-julebrus-6-x-033l/
|
||||||
|
https://oda.com/no/products/28870-prior-frokostegg-fra-frittgaende-honer-str-l/
|
||||||
|
https://oda.com/no/products/40760-monster-monster-ultra-watermelon/
|
Loading…
Reference in New Issue
Block a user