From 754a2a357a65ad9e265bd95c28fd865c06a2af05 Mon Sep 17 00:00:00 2001 From: Alex Thomassen Date: Mon, 31 Oct 2022 18:12:53 +0000 Subject: [PATCH] Refactor unit price handling + CSV/JSON output files --- .env.example | 3 +++ OdaScraper.php | 68 +++++++++++++++++++++++++++++++++----------------- 2 files changed, 48 insertions(+), 23 deletions(-) diff --git a/.env.example b/.env.example index f2d31a7..a58f6c7 100644 --- a/.env.example +++ b/.env.example @@ -10,5 +10,8 @@ PRODUCT_OUTPUT_FILE=./data/products.csv # Can be either 'json' or 'csv' PRODUCT_OUTPUT_FORMAT=json +# Only relevant if PRODUCT_OUTPUT_FORMAT is 'csv'. The CSV delimiter to use in the output file. +CSV_SEPARATOR=";" + # User agent sent with requests. Not sure if this is strictly necessary, but it doesn't hurt. USER_AGENT="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36" \ No newline at end of file diff --git a/OdaScraper.php b/OdaScraper.php index 370856f..0073e9c 100644 --- a/OdaScraper.php +++ b/OdaScraper.php @@ -207,21 +207,35 @@ class Oda $name = trim((string) $crawler->filter('h1')->text()); $price = (float) $crawler->filter('.price')->attr('content'); - $price_unit_string = trim((string) $crawler->filter('.unit-price')->text()); - ## Møkkete datavask på string begynn kr 50,50 per unit - $price_unit_string_nocurrency = trim($price_unit_string, 'kr '); // 50,50 per unit - $price_unit_string_dotfix = str_replace(',','.', $price_unit_string_nocurrency);# 50.50 per unit - $unit_data = explode(' per ',$price_unit_string_dotfix); // 50.50 unit + + /** + * Unit price handler. + * Example: "kr 187,50 per unit" + */ + $priceUnitText = trim((string) $crawler->filter('.unit-price')->text()); + + // Remove redundant text + $priceUnitText = str_replace(['kr ', ' per'], '', $priceUnitText); + + /** + * Split the price and unit. + * + * In theory the results should be something like: + * - [0] => 187.50 + * - [1] => unit + */ + $priceUnit = explode(' ', trim($priceUnitText)); return [ 'name' => $name, 'price' => $price, - 'unit_type' => $unit_data, + 'unit_price' => (float) $priceUnit[0], + 'unit' => $priceUnit[1], ]; } /** - * Writes the scraped data to a CSV file. + * Writes the scraped data to a CSV & a JSON file. * * @param array $products * @@ -233,36 +247,44 @@ class Oda return; } - $outputFormat = $_ENV['PRODUCT_OUTPUT_FORMAT']; $outputFile = $_ENV['PRODUCT_OUTPUT_FILE']; - if (empty($outputFormat) || empty($outputFile)) { + if (empty($outputFile)) { echo 'No output format or file specified in environment variables or .env, skipping...' . PHP_EOL; return; } - if ($outputFormat === 'json') { - file_put_contents($outputFile, json_encode($products)); - return; - } + // Output JSON + $jsonFile = str_replace('.csv', '.json', $outputFile); + file_put_contents($jsonFile, json_encode($products, JSON_PRETTY_PRINT)); - if ($outputFormat === 'csv') { - $handle = fopen($outputFile, 'w'); + $csvFile = str_replace('.json', '.csv', $jsonFile); + $handle = fopen($csvFile, 'w'); - // Add the first row as headers. Assuming all products are formatted the same, of course. - $columns = array_keys($products[0]); - fputcsv($handle, $columns); + // Add the first row as headers. Assuming all products are formatted the same, of course. + $csvSeparator = $_ENV['CSV_SEPARATOR'] ?? ';'; + $columns = array_keys($products[0]); + fputcsv($handle, $columns, $csvSeparator); - foreach ($products as $product) + + foreach ($products as $product) + { + /** + * Special handler for arrays inside the product array. + * To retain the data, we 'convert' the array to a string. + */ + foreach ($product as $subIdx => $subItem) { - fputcsv($handle, $product); + if (is_array($subItem)) + { + $product[$subIdx] = implode(',', $subItem); + } } - fclose($handle); - return; + fputcsv($handle, $product, $csvSeparator); } - echo sprintf('Invalid file output format: `%s`%s', $outputFormat, PHP_EOL); + fclose($handle); } /**