Refactor unit price handling + CSV/JSON output files

This commit is contained in:
Alex Thomassen 2022-10-31 18:12:53 +00:00
parent c978ceb26a
commit 754a2a357a
Signed by untrusted user: Alex
GPG Key ID: 10BD786B5F6FF5DE
2 changed files with 48 additions and 23 deletions

View File

@ -10,5 +10,8 @@ PRODUCT_OUTPUT_FILE=./data/products.csv
# Can be either 'json' or 'csv'
PRODUCT_OUTPUT_FORMAT=json
# Only relevant if PRODUCT_OUTPUT_FORMAT is 'csv'. The CSV delimiter to use in the output file.
CSV_SEPARATOR=";"
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"

View File

@ -207,21 +207,35 @@ class Oda
$name = trim((string) $crawler->filter('h1')->text());
$price = (float) $crawler->filter('.price')->attr('content');
$price_unit_string = trim((string) $crawler->filter('.unit-price')->text());
## Messy data cleanup on the string; it starts out as: kr 50,50 per unit
$price_unit_string_nocurrency = trim($price_unit_string, 'kr '); // 50,50 per unit
$price_unit_string_dotfix = str_replace(',','.', $price_unit_string_nocurrency);# 50.50 per unit
$unit_data = explode(' per ',$price_unit_string_dotfix); // 50.50 unit
/**
* Unit price handler.
* Example: "kr 187,50 per unit"
*/
$priceUnitText = trim((string) $crawler->filter('.unit-price')->text());
// Remove redundant text
$priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText);
/**
* Split the price and unit.
*
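* For the example above, the results are:
* - [0] => 187,50 (the decimal comma is NOT converted here; replace ',' with '.' before casting to float, otherwise the cast yields 187.0)
* - [1] => unit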
*/
$priceUnit = explode(' ', trim($priceUnitText));
return [
'name' => $name,
'price' => $price,
'unit_type' => $unit_data,
'unit_price' => (float) $priceUnit[0],
'unit' => $priceUnit[1],
];
}
/**
* Writes the scraped data to a CSV file.
* Writes the scraped data to a CSV & a JSON file.
*
* @param array $products
*
@ -233,36 +247,44 @@ class Oda
return;
}
$outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'];
$outputFile = $_ENV['PRODUCT_OUTPUT_FILE'];
if (empty($outputFormat) || empty($outputFile)) {
if (empty($outputFile)) {
echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
return;
}
if ($outputFormat === 'json') {
file_put_contents($outputFile, json_encode($products));
return;
}
// Output JSON
$jsonFile = str_replace('.csv', '.json', $outputFile);
file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT));
if ($outputFormat === 'csv') {
$handle = fopen($outputFile, 'w');
$csvFile = str_replace('.json', '.csv', $jsonFile);
$handle = fopen($csvFile, 'w');
// Add the first row as headers. Assuming all products are formatted the same, of course.
$columns = array_keys($products[0]);
fputcsv($handle, $columns);
// Add the first row as headers. Assuming all products are formatted the same, of course.
$csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';';
$columns = array_keys($products[0]);
fputcsv($handle, $columns, $csvSeparator);
foreach ($products as $product)
foreach ($products as $product)
{
/**
* Special handler for arrays inside the product array.
* To retain the data, we 'convert' the array to a string.
*/
foreach ($product as $subIdx => $subItem)
{
fputcsv($handle, $product);
if (is_array($subItem))
{
$product[$subIdx] = implode(',', $subItem);
}
}
fclose($handle);
return;
fputcsv($handle, $product, $csvSeparator);
}
echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
fclose($handle);
}
/**