Compare commits

..

No commits in common. "main" and "main" have entirely different histories.
main ... main

2 changed files with 29 additions and 54 deletions

View File

@ -10,8 +10,5 @@ PRODUCT_OUTPUT_FILE=./data/products.csv
# Can be either 'json' or 'csv'
PRODUCT_OUTPUT_FORMAT=json
# Only relevant if PRODUCT_OUTPUT_FORMAT is 'csv'. The CSV delimiter to use in the output file.
CSV_SEPARATOR=";"
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"

View File

@ -207,35 +207,21 @@ class Oda
$name = trim((string) $crawler->filter('h1')->text());
$price = (float) $crawler->filter('.price')->attr('content');
/**
* Unit price handler.
* Example: "kr 187,50 per unit"
*/
$priceUnitText = trim((string) $crawler->filter('.unit-price')->text());
// Remove redundant text
$priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText);
/**
* Split the price and unit.
*
* In theory the results should be something like:
* - [0] => 187.50
* - [1] => unit
*/
$priceUnit = explode(' ', trim($priceUnitText));
$price_unit_string = trim((string) $crawler->filter('.unit-price')->text());
## Messy data cleanup of the string, which starts out like: kr 50,50 per unit
$price_unit_string_nocurrency = trim($price_unit_string, 'kr '); // 50,50 per unit
$price_unit_string_dotfix = str_replace(',','.', $price_unit_string_nocurrency);# 50.50 per unit
$unit_data = explode(' per ',$price_unit_string_dotfix); // 50.50 unit
return [
'name' => $name,
'price' => $price,
'unit_price' => (float) $priceUnit[0],
'unit' => $priceUnit[1],
'unit_type' => $unit_data,
];
}
/**
* Writes the scraped data to a CSV & a JSON file.
* Writes the scraped data to a JSON or CSV file, depending on the PRODUCT_OUTPUT_FORMAT setting.
*
* @param array $products
*
@ -247,44 +233,36 @@ class Oda
return;
}
$outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'];
$outputFile = $_ENV['PRODUCT_OUTPUT_FILE'];
if (empty($outputFile)) {
if (empty($outputFormat) || empty($outputFile)) {
echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
return;
}
// Output JSON
$jsonFile = str_replace('.csv', '.json', $outputFile);
file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT));
$csvFile = str_replace('.json', '.csv', $jsonFile);
$handle = fopen($csvFile, 'w');
// Add the first row as headers. Assuming all products are formatted the same, of course.
$csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';';
$columns = array_keys($products[0]);
fputcsv($handle, $columns, $csvSeparator);
foreach ($products as $product)
{
/**
* Special handler for arrays inside the product array.
* To retain the data, we 'convert' the array to a string.
*/
foreach ($product as $subIdx => $subItem)
{
if (is_array($subItem))
{
$product[$subIdx] = implode(',', $subItem);
}
}
fputcsv($handle, $product, $csvSeparator);
if ($outputFormat === 'json') {
file_put_contents($outputFile, json_encode($products));
return;
}
fclose($handle);
if ($outputFormat === 'csv') {
$handle = fopen($outputFile, 'w');
// Add the first row as headers. Assuming all products are formatted the same, of course.
$columns = array_keys($products[0]);
fputcsv($handle, $columns);
foreach ($products as $product)
{
fputcsv($handle, $product);
}
fclose($handle);
return;
}
echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
}
/**