Refactor unit price handling + CSV/JSON output files

This commit is contained in:
Alex Thomassen 2022-10-31 18:12:53 +00:00
parent c978ceb26a
commit 754a2a357a
Signed by: Alex
GPG Key ID: 10BD786B5F6FF5DE
2 changed files with 48 additions and 23 deletions

View File

@ -10,5 +10,8 @@ PRODUCT_OUTPUT_FILE=./data/products.csv
# Can be either 'json' or 'csv' # Can be either 'json' or 'csv'
PRODUCT_OUTPUT_FORMAT=json PRODUCT_OUTPUT_FORMAT=json
# The delimiter used when writing the CSV output file. Falls back to ';' if unset.
CSV_SEPARATOR=";"
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt. # User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36" USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"

View File

@ -207,21 +207,35 @@ class Oda
$name = trim((string) $crawler->filter('h1')->text()); $name = trim((string) $crawler->filter('h1')->text());
$price = (float) $crawler->filter('.price')->attr('content'); $price = (float) $crawler->filter('.price')->attr('content');
$price_unit_string = trim((string) $crawler->filter('.unit-price')->text());
## Møkkete datavask på string begynn kr 50,50 per unit /**
$price_unit_string_nocurrency = trim($price_unit_string, 'kr '); // 50,50 per unit * Unit price handler.
$price_unit_string_dotfix = str_replace(',','.', $price_unit_string_nocurrency);# 50.50 per unit * Example: "kr 187,50 per unit"
$unit_data = explode(' per ',$price_unit_string_dotfix); // 50.50 unit */
$priceUnitText = trim((string) $crawler->filter('.unit-price')->text());
// Remove redundant text
$priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText);
/**
* Split the price and unit.
*
* In theory the results should be something like:
* - [0] => 187.50
* - [1] => unit
*/
$priceUnit = explode(' ', trim($priceUnitText));
return [ return [
'name' => $name, 'name' => $name,
'price' => $price, 'price' => $price,
'unit_type' => $unit_data, 'unit_price' => (float) $priceUnit[0],
'unit' => $priceUnit[1],
]; ];
} }
/** /**
* Writes the scraped data to a CSV file. * Writes the scraped data to a CSV & a JSON file.
* *
* @param array $products * @param array $products
* *
@ -233,36 +247,44 @@ class Oda
return; return;
} }
$outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'];
$outputFile = $_ENV['PRODUCT_OUTPUT_FILE']; $outputFile = $_ENV['PRODUCT_OUTPUT_FILE'];
if (empty($outputFormat) || empty($outputFile)) { if (empty($outputFile)) {
echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL; echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
return; return;
} }
if ($outputFormat === 'json') { // Output JSON
file_put_contents($outputFile, json_encode($products)); $jsonFile = str_replace('.csv', '.json', $outputFile);
return; file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT));
}
if ($outputFormat === 'csv') { $csvFile = str_replace('.json', '.csv', $jsonFile);
$handle = fopen($outputFile, 'w'); $handle = fopen($csvFile, 'w');
// Add the first row as headers. Assuming all products are formatted the same, of course. // Add the first row as headers. Assuming all products are formatted the same, of course.
$csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';';
$columns = array_keys($products[0]); $columns = array_keys($products[0]);
fputcsv($handle, $columns); fputcsv($handle, $columns, $csvSeparator);
foreach ($products as $product) foreach ($products as $product)
{ {
fputcsv($handle, $product); /**
* Special handler for arrays inside the product array.
* To retain the data, we 'convert' the array to a string.
*/
foreach ($product as $subIdx => $subItem)
{
if (is_array($subItem))
{
$product[$subIdx] = implode(',', $subItem);
}
}
fputcsv($handle, $product, $csvSeparator);
} }
fclose($handle); fclose($handle);
return;
}
echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
} }
/** /**