Compare commits
2 Commits
Author | SHA1 | Date | |
---|---|---|---|
754a2a357a | |||
c978ceb26a |
@ -10,5 +10,8 @@ PRODUCT_OUTPUT_FILE=./data/products.csv
|
||||
# Can be either 'json' or 'csv'
|
||||
PRODUCT_OUTPUT_FORMAT=json
|
||||
|
||||
# Only relevant if PRODUCT_OUTPUT_FORMAT is 'csv'. The CSV delimiter to use in the output file.
|
||||
CSV_SEPARATOR=";"
|
||||
|
||||
# User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt.
|
||||
USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36"
|
@ -207,21 +207,35 @@ class Oda
|
||||
|
||||
$name = trim((string) $crawler->filter('h1')->text());
|
||||
$price = (float) $crawler->filter('.price')->attr('content');
|
||||
$price_unit_string = trim((string) $crawler->filter('.unit-price')->text());
|
||||
## Messy data cleanup of the unit-price string, which starts out as "kr 50,50 per unit"
|
||||
$price_unit_string_nocurrency = trim($price_unit_string, 'kr '); // 50,50 per unit
|
||||
$price_unit_string_dotfix = str_replace(',','.', $price_unit_string_nocurrency);# 50.50 per unit
|
||||
$unit_data = explode(' per ',$price_unit_string_dotfix); // 50.50 unit
|
||||
|
||||
/**
|
||||
* Unit price handler.
|
||||
* Example: "kr 187,50 per unit"
|
||||
*/
|
||||
$priceUnitText = trim((string) $crawler->filter('.unit-price')->text());
|
||||
|
||||
// Remove redundant text
|
||||
$priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText);
|
||||
|
||||
/**
|
||||
* Split the price and unit.
|
||||
*
|
||||
* In theory the results should be something like:
|
||||
* - [0] => 187.50
|
||||
* - [1] => unit
|
||||
*/
|
||||
$priceUnit = explode(' ', trim($priceUnitText));
|
||||
|
||||
return [
|
||||
'name' => $name,
|
||||
'price' => $price,
|
||||
'unit_type' => $unit_data,
|
||||
'unit_price' => (float) $priceUnit[0],
|
||||
'unit' => $priceUnit[1],
|
||||
];
|
||||
}
|
||||
|
||||
/**
|
||||
* Writes the scraped data to a CSV file.
|
||||
* Writes the scraped data to a CSV & a JSON file.
|
||||
*
|
||||
* @param array $products
|
||||
*
|
||||
@ -233,36 +247,44 @@ class Oda
|
||||
return;
|
||||
}
|
||||
|
||||
$outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT'];
|
||||
$outputFile = $_ENV['PRODUCT_OUTPUT_FILE'];
|
||||
|
||||
if (empty($outputFormat) || empty($outputFile)) {
|
||||
if (empty($outputFile)) {
|
||||
echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL;
|
||||
return;
|
||||
}
|
||||
|
||||
if ($outputFormat === 'json') {
|
||||
file_put_contents($outputFile, json_encode($products));
|
||||
return;
|
||||
}
|
||||
// Output JSON
|
||||
$jsonFile = str_replace('.csv', '.json', $outputFile);
|
||||
file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT));
|
||||
|
||||
if ($outputFormat === 'csv') {
|
||||
$handle = fopen($outputFile, 'w');
|
||||
$csvFile = str_replace('.json', '.csv', $jsonFile);
|
||||
$handle = fopen($csvFile, 'w');
|
||||
|
||||
// Add the first row as headers. Assuming all products are formatted the same, of course.
|
||||
$csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';';
|
||||
$columns = array_keys($products[0]);
|
||||
fputcsv($handle, $columns);
|
||||
fputcsv($handle, $columns, $csvSeparator);
|
||||
|
||||
|
||||
foreach ($products as $product)
|
||||
{
|
||||
fputcsv($handle, $product);
|
||||
/**
|
||||
* Special handler for arrays inside the product array.
|
||||
* To retain the data, we 'convert' the array to a string.
|
||||
*/
|
||||
foreach ($product as $subIdx => $subItem)
|
||||
{
|
||||
if (is_array($subItem))
|
||||
{
|
||||
$product[$subIdx] = implode(',', $subItem);
|
||||
}
|
||||
}
|
||||
|
||||
fputcsv($handle, $product, $csvSeparator);
|
||||
}
|
||||
|
||||
fclose($handle);
|
||||
return;
|
||||
}
|
||||
|
||||
echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL);
|
||||
}
|
||||
|
||||
/**
|
||||
|
Loading…
Reference in New Issue
Block a user