Refactor unit price handling + CSV/JSON output files
This commit is contained in:
parent
c978ceb26a
commit
754a2a357a
@ -10,5 +10,8 @@ PRODUCT_OUTPUT_FILE=./data/products.csv
|
|||||||
# Can be either 'json' or 'csv'
|
# Can be either 'json' or 'csv'
|
||||||
PRODUCT_OUTPUT_FORMAT=json
|
PRODUCT_OUTPUT_FORMAT=json
|
||||||
|
|
||||||
|
# The delimiter used when writing the CSV output file. Falls back to ';' when unset.
|
||||||
|
CSV_SEPARATOR=";"
|
||||||
|
|
||||||
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
|
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
|
||||||
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
|
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
|
@ -207,21 +207,35 @@ class Oda
|
|||||||
|
|
||||||
$name = trim((string) $crawler->filter('h1')->text());
|
$name = trim((string) $crawler->filter('h1')->text());
|
||||||
$price = (float) $crawler->filter('.price')->attr('content');
|
$price = (float) $crawler->filter('.price')->attr('content');
|
||||||
$price_unit_string = trim((string) $crawler->filter('.unit-price')->text());
|
|
||||||
## Møkkete datavask på string begynn kr 50,50 per unit
|
/**
|
||||||
$price_unit_string_nocurrency = trim($price_unit_string, 'kr '); // 50,50 per unit
|
* Unit price handler.
|
||||||
$price_unit_string_dotfix = str_replace(',','.', $price_unit_string_nocurrency);# 50.50 per unit
|
* Example: "kr 187,50 per unit"
|
||||||
$unit_data = explode(' per ',$price_unit_string_dotfix); // 50.50 unit
|
*/
|
||||||
|
$priceUnitText = trim((string) $crawler->filter('.unit-price')->text());
|
||||||
|
|
||||||
|
// Remove redundant text
|
||||||
|
$priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Split the price and unit.
|
||||||
|
*
|
||||||
|
* In theory the results should be something like:
|
||||||
|
* - [0] => 187.50
|
||||||
|
* - [1] => unit
|
||||||
|
*/
|
||||||
|
$priceUnit = explode(' ', trim($priceUnitText));
|
||||||
|
|
||||||
return [
|
return [
|
||||||
'name' => $name,
|
'name' => $name,
|
||||||
'price' => $price,
|
'price' => $price,
|
||||||
'unit_type' => $unit_data,
|
'unit_price' => (float) $priceUnit[0],
|
||||||
|
'unit' => $priceUnit[1],
|
||||||
];
|
];
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Writes the scraped data to a CSV file.
|
* Writes the scraped data to a CSV & a JSON file.
|
||||||
*
|
*
|
||||||
* @param array $products
|
* @param array $products
|
||||||
*
|
*
|
||||||
@ -233,36 +247,44 @@ class Oda
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
$outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'];
|
|
||||||
$outputFile = $_ENV['PRODUCT_OUTPUT_FILE'];
|
$outputFile = $_ENV['PRODUCT_OUTPUT_FILE'];
|
||||||
|
|
||||||
if (empty($outputFormat) || empty($outputFile)) {
|
if (empty($outputFile)) {
|
||||||
echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
|
echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if ($outputFormat === 'json') {
|
// Output JSON
|
||||||
file_put_contents($outputFile, json_encode($products));
|
$jsonFile = str_replace('.csv', '.json', $outputFile);
|
||||||
return;
|
file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT));
|
||||||
}
|
|
||||||
|
|
||||||
if ($outputFormat === 'csv') {
|
$csvFile = str_replace('.json', '.csv', $jsonFile);
|
||||||
$handle = fopen($outputFile, 'w');
|
$handle = fopen($csvFile, 'w');
|
||||||
|
|
||||||
// Add the first row as headers. Assuming all products are formatted the same, of course.
|
// Add the first row as headers. Assuming all products are formatted the same, of course.
|
||||||
|
$csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';';
|
||||||
$columns = array_keys($products[0]);
|
$columns = array_keys($products[0]);
|
||||||
fputcsv($handle, $columns);
|
fputcsv($handle, $columns, $csvSeparator);
|
||||||
|
|
||||||
|
|
||||||
foreach ($products as $product)
|
foreach ($products as $product)
|
||||||
{
|
{
|
||||||
fputcsv($handle, $product);
|
/**
|
||||||
|
* Special handler for arrays inside the product array.
|
||||||
|
* To retain the data, we 'convert' the array to a string.
|
||||||
|
*/
|
||||||
|
foreach ($product as $subIdx => $subItem)
|
||||||
|
{
|
||||||
|
if (is_array($subItem))
|
||||||
|
{
|
||||||
|
$product[$subIdx] = implode(',', $subItem);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fputcsv($handle, $product, $csvSeparator);
|
||||||
}
|
}
|
||||||
|
|
||||||
fclose($handle);
|
fclose($handle);
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
Loading…
Reference in New Issue
Block a user