First version

This commit is contained in:
Florian Brinker 2021-04-20 00:19:48 +02:00
parent 3474d5a46e
commit 3cceb4bdd7
15 changed files with 2023 additions and 0 deletions

6
.gitignore vendored Normal file
View File

@ -0,0 +1,6 @@
/*.phar
/list.json
/config.php
/scrape.php
/vendor

14
composer.json Normal file
View File

@ -0,0 +1,14 @@
{
"name": "fbrinker/cookidoo-shopping-advanced",
"description": "A Cookidoo Shopping List Scraper and Parser for better results. It can unify and merge all ingredients by different rule sets.",
"type": "project",
"autoload": {
"psr-4": {
"CookidooShoppingAdvanced\\": "src/"
}
},
"require": {
"fabpot/goutte": "^4.0",
"nadar/stemming": "^1.0"
}
}

1390
composer.lock generated Normal file

File diff suppressed because it is too large Load Diff

8
config.sample.php Normal file
View File

@ -0,0 +1,8 @@
<?php
// Sample configuration: copy this file to config.php and replace the placeholder values.
return [
// Account credentials; presumably the values handed to CookidooScraper::login() - confirm against the entry script.
'account' => [
'username' => 'foo',
'password' => 'bar',
]
];

97
src/CookidooScraper.php Normal file
View File

@ -0,0 +1,97 @@
<?php
namespace CookidooShoppingAdvanced;
use CookidooShoppingAdvanced\Models\Category;
use CookidooShoppingAdvanced\Models\Ingredient;
use CookidooShoppingAdvanced\Models\ShoppingList;
use Exception;
use Goutte\Client;
use Symfony\Component\HttpClient\HttpClient;
/**
 * Logs into Cookidoo with a Goutte client and scrapes the shopping list page
 * into a ShoppingList model.
 */
class CookidooScraper
{
    private const PAGE_LOGIN = 'https://cookidoo.de/profile/de-DE/login?redirectAfterLogin=%2Ffoundation%2Fde-DE';
    private const PAGE_LOGIN_BUTTON_ID = 'j_submit_id';
    private const PAGE_LOGIN_FIELD_USER = 'j_username';
    private const PAGE_LOGIN_FIELD_PASSWORD = 'j_password';
    private const PAGE_SHOPPING_LIST = 'https://cookidoo.de/shopping/de-DE';
    private const PAGE_SHOPPING_LIST_GROUP_TAG = 'pm-check-group';
    private const PAGE_SHOPPING_LIST_GROUP_HEADLINE = 'h4.pm-check-group__title';
    private const PAGE_SHOPPING_LIST_GROUP_INGREDIENT = 'li.pm-check-group__list-item';
    private const PAGE_SHOPPING_LIST_GROUP_INGREDIENT_LABEL = 'span[data-type=ingredientNotation]';
    private const PAGE_SHOPPING_LIST_GROUP_INGREDIENT_AMOUNT = 'span[data-type=value]';
    private const PAGE_SHOPPING_LIST_GROUP_INGREDIENT_UNIT = 'span[data-type=unitNotation]';
    private const USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:87.0) Gecko/20100101 Firefox/87.0';

    private Client $client;

    /** True once login() has completed without an authentication error. */
    private bool $login = false;

    /**
     * Creates the HTTP client with browser-like request headers and a 60 second timeout.
     */
    public function __construct()
    {
        $this->client = new Client(HttpClient::create([
            'headers' => [
                'user-agent' => self::USER_AGENT,
                'accept' => "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                'accept-language' => "de,en-US;q=0.7,en;q=0.3",
            ],
            'timeout' => 60,
        ]));
    }

    /**
     * Submits the login form with the given credentials.
     *
     * @throws Exception when the form cannot be loaded/submitted or the site reports an
     *                   authentication error; the original exception is attached as "previous"
     */
    public function login(string $user, string $password): void
    {
        try {
            $crawler = $this->client->request('GET', self::PAGE_LOGIN);
            $form = $crawler->selectButton(self::PAGE_LOGIN_BUTTON_ID)->form();
            $submitResult = $this->client->submit($form, [
                self::PAGE_LOGIN_FIELD_USER => $user,
                self::PAGE_LOGIN_FIELD_PASSWORD => $password,
            ]);

            // A failed login is detected by this marker in the URI of the submit result.
            if (stripos($submitResult->getUri(), 'authentication_error=true') !== false) {
                throw new Exception("Authentication error");
            }

            $this->login = true;
        } catch (Exception $e) {
            // Chain the cause so the original stack trace is not lost.
            throw new Exception(sprintf("Cannot login: %s", $e->getMessage()), 0, $e);
        }
    }

    /**
     * Fetches the shopping list page and converts every ingredient group into model objects.
     *
     * Groups without text or without a headline are skipped.
     *
     * @throws Exception when login() has not succeeded yet or an ingredient has no name
     */
    public function getShoppingList(): ShoppingList
    {
        if (!$this->login) {
            throw new Exception('You need to login first.');
        }

        $groups = $this->client
            ->request('GET', self::PAGE_SHOPPING_LIST)
            ->filter(self::PAGE_SHOPPING_LIST_GROUP_TAG);

        $shoppingList = new ShoppingList();
        $groups->each(function ($groupNode) use ($shoppingList) {
            if (empty($groupNode->text(''))) {
                return;
            }

            $headline = $groupNode->filter(self::PAGE_SHOPPING_LIST_GROUP_HEADLINE);
            if (empty($headline->text(''))) {
                return;
            }

            $category = new Category($headline->text());
            $ingredientNodes = $groupNode->filter(self::PAGE_SHOPPING_LIST_GROUP_INGREDIENT);
            $ingredientNodes->each(function ($node) use ($shoppingList, $category) {
                $ingredient = new Ingredient(
                    $category,
                    $node->filter(self::PAGE_SHOPPING_LIST_GROUP_INGREDIENT_LABEL)->text(''),
                    // A missing amount falls back to the string '0' (both parameters are strings).
                    $node->filter(self::PAGE_SHOPPING_LIST_GROUP_INGREDIENT_AMOUNT)->text('0'),
                    $node->filter(self::PAGE_SHOPPING_LIST_GROUP_INGREDIENT_UNIT)->text(''),
                );

                if (!$ingredient->isValid()) {
                    throw new \Exception("Invalid ingredient: $ingredient");
                }

                $shoppingList->add($ingredient);
            });
        });

        return $shoppingList;
    }

    /**
     * Misspelled original name, kept so existing callers keep working.
     *
     * @deprecated use getShoppingList()
     *
     * @throws Exception see getShoppingList()
     */
    public function getShopptingList(): ShoppingList
    {
        return $this->getShoppingList();
    }
}

21
src/Models/Category.php Normal file
View File

@ -0,0 +1,21 @@
<?php
namespace CookidooShoppingAdvanced\Models;
/**
 * Value object holding the name of the category an ingredient is listed under.
 */
class Category implements \JsonSerializable
{
    private string $name;

    public function __construct(string $name)
    {
        $this->name = $name;
    }

    public function getName(): string
    {
        return $this->name;
    }

    /**
     * Data used by json_encode().
     *
     * The explicit return type avoids the PHP 8.1+ deprecation notice for
     * JsonSerializable implementations without a compatible return type.
     *
     * @return array{name: string}
     */
    public function jsonSerialize(): array
    {
        return [
            'name' => $this->name,
        ];
    }
}

106
src/Models/Ingredient.php Normal file
View File

@ -0,0 +1,106 @@
<?php
namespace CookidooShoppingAdvanced\Models;
use CookidooShoppingAdvanced\Normalizer;
/**
 * One shopping list entry: a named ingredient with an optional amount and unit,
 * assigned to a category.
 */
class Ingredient implements \JsonSerializable
{
    /** Identifier generated with uniqid(); ShoppingList uses it to find entries. */
    private string $id;
    private Category $category;
    private string $name;
    private string $amount;
    private string $unit;

    /** Cached normalized name; the empty string means "not computed yet". */
    private string $cleanName = '';

    /** @var Ingredient[] debug info: a snapshot of this entry plus every ingredient merged into it */
    private array $merged = [];

    public function __construct(
        Category $category,
        string $name,
        string $amount = '',
        string $unit = ''
    ) {
        $this->id = uniqid();
        $this->category = $category;
        $this->name = $name;
        $this->amount = $amount;
        $this->unit = $unit;
        // The first "merged" entry is always a copy of the ingredient as it was created.
        $this->merged[] = clone $this;
    }

    /**
     * An ingredient is valid when its name is not empty (in the sense of empty()).
     */
    public function isValid(): bool
    {
        return !empty($this->name);
    }

    public function getId(): string
    {
        return $this->id;
    }

    public function getCategory(): Category
    {
        return $this->category;
    }

    public function getName(): string
    {
        return $this->name;
    }

    /**
     * Returns the normalized name, computing it via Normalizer::normalize() on first use.
     *
     * While the stored clean name is empty, the value is recomputed from the original name.
     */
    public function getCleanName(): string
    {
        if (empty($this->cleanName)) {
            $this->cleanName = Normalizer::normalize($this->name);
        }

        return $this->cleanName;
    }

    public function setCleanName(string $cleanName): void
    {
        $this->cleanName = $cleanName;
    }

    public function getAmount(): string
    {
        return $this->amount;
    }

    public function setAmount(string $amount): void
    {
        $this->amount = $amount;
    }

    public function getUnit(): string
    {
        return $this->unit;
    }

    /**
     * Records an ingredient that was merged into this one (debug info).
     */
    public function addMerged(Ingredient $ingredient): void
    {
        $this->merged[] = $ingredient;
    }

    /**
     * @return Ingredient[] empty while nothing was merged (only the initial snapshot is stored),
     *                      otherwise the snapshot followed by all merged ingredients
     */
    public function getMerged(): array
    {
        if (count($this->merged) === 1) {
            return [];
        }

        return $this->merged;
    }

    /**
     * Renders "<name>", followed by " <amount> <unit>" when either of the two is non-empty.
     */
    public function __toString(): string
    {
        $amount = $this->getAmount() ?: '';
        $unit = $this->getUnit() ?: '';
        $amountAndUnit = trim(sprintf('%s %s', $amount, $unit));
        if (!empty($amountAndUnit)) {
            $amountAndUnit = ' ' . $amountAndUnit;
        }

        return $this->getName() . $amountAndUnit;
    }

    /**
     * Data used by json_encode(); id, clean name and merge history are not included.
     *
     * The explicit return type avoids the PHP 8.1+ deprecation notice for
     * JsonSerializable implementations without a compatible return type.
     *
     * @return array{name: string, amount: string, unit: string, category: Category}
     */
    public function jsonSerialize(): array
    {
        return [
            'name' => $this->name,
            'amount' => $this->amount,
            'unit' => $this->unit,
            'category' => $this->category,
        ];
    }
}

120
src/Models/ShoppingList.php Normal file
View File

@ -0,0 +1,120 @@
<?php
namespace CookidooShoppingAdvanced\Models;
/**
 * Ordered collection of ingredients that can be iterated with foreach and
 * accessed like an array.
 *
 * The entries are kept as a gap-free, zero-based list: every removal re-indexes
 * the list and moves the iteration cursor accordingly, so entries may be removed
 * while the list is being iterated without skipping or truncating the loop.
 */
class ShoppingList implements \ArrayAccess, \Iterator
{
    /** @var Ingredient[] */
    private array $ingredients = [];

    /** Iteration cursor; may temporarily be -1 after the current first entry was removed. */
    private int $index = 0;

    public function add(Ingredient $ingredient): void
    {
        $this->ingredients[] = $ingredient;
    }

    /**
     * Removes every entry whose id equals the id of the given ingredient.
     */
    public function remove(Ingredient $ingredientToRemove): void
    {
        $idToRemove = $ingredientToRemove->getId();
        $this->discard(static fn ($ingredient): bool => $ingredient->getId() === $idToRemove);
    }

    /**
     * Replaces the first entry that has the same id as the given ingredient; no-op if none matches.
     */
    public function update(Ingredient $ingredientToUpdate): void
    {
        foreach ($this->ingredients as $key => $ingredient) {
            if ($ingredient->getId() === $ingredientToUpdate->getId()) {
                $this->ingredients[$key] = $ingredientToUpdate;

                return;
            }
        }
    }

    /**
     * @return Ingredient[] a copy of the current entries
     */
    public function get(): array
    {
        return $this->ingredients;
    }

    /**
     * @return array<string, Ingredient[]> entries grouped by category name
     */
    public function getByCategory(): array
    {
        $result = [];
        foreach ($this->ingredients as $ingredient) {
            $result[$ingredient->getCategory()->getName()][] = $ingredient;
        }

        return $result;
    }

    public function isEmpty(): bool
    {
        return empty($this->ingredients);
    }

    public function rewind(): void
    {
        $this->index = 0;
    }

    /**
     * @return Ingredient
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return $this->ingredients[$this->index];
    }

    public function key(): int
    {
        return $this->index;
    }

    public function next(): void
    {
        ++$this->index;
    }

    public function valid(): bool
    {
        return isset($this->ingredients[$this->index]);
    }

    /**
     * Appends the value when no offset is given, otherwise stores it at the offset.
     */
    public function offsetSet($offset, $value): void
    {
        if (is_null($offset)) {
            $this->ingredients[] = $value;
        } else {
            $this->ingredients[$offset] = $value;
        }
    }

    public function offsetExists($offset): bool
    {
        return isset($this->ingredients[$offset]);
    }

    /**
     * Removes the entry at the offset and re-indexes the list, so no gap is
     * left behind that would end a foreach early.
     */
    public function offsetUnset($offset): void
    {
        if (!isset($this->ingredients[$offset])) {
            return;
        }

        // Compare as strings because PHP normalizes numeric string keys to integers.
        $this->discard(static fn ($ingredient, $key): bool => (string)$key === (string)$offset);
    }

    /**
     * @return Ingredient|null the entry at the offset or null when there is none
     */
    #[\ReturnTypeWillChange]
    public function offsetGet($offset)
    {
        return $this->ingredients[$offset] ?? null;
    }

    public function toJson(): string
    {
        return json_encode($this->ingredients);
    }

    /**
     * Rebuilds a list from the output of toJson().
     *
     * @param string $json JSON array of serialized ingredients; an empty string yields an empty list
     *
     * @throws \InvalidArgumentException when the input is not a JSON array
     */
    public static function fromJson(string $json): self
    {
        $list = new self();
        if (empty($json)) {
            return $list;
        }

        $data = json_decode($json);
        if (!is_array($data)) {
            $reason = json_last_error() !== JSON_ERROR_NONE ? json_last_error_msg() : 'expected a JSON array';
            throw new \InvalidArgumentException(sprintf('Cannot parse shopping list JSON: %s', $reason));
        }

        foreach ($data as $entry) {
            $list->add(new Ingredient(
                new Category($entry->category->name),
                $entry->name,
                $entry->amount,
                $entry->unit,
            ));
        }

        return $list;
    }

    /**
     * Rebuilds the list without the entries the callback selects.
     *
     * For every dropped entry at or before the cursor, the cursor moves back by one,
     * so a running foreach continues with the entry that followed the removed one.
     *
     * @param callable $shouldDrop receives (entry, key) and returns true for entries to drop
     */
    private function discard(callable $shouldDrop): void
    {
        $kept = [];
        $droppedUpToCursor = 0;
        $position = 0;
        foreach ($this->ingredients as $key => $ingredient) {
            if ($shouldDrop($ingredient, $key)) {
                if ($position <= $this->index) {
                    ++$droppedUpToCursor;
                }
            } else {
                $kept[] = $ingredient;
            }
            ++$position;
        }

        $this->ingredients = $kept;
        $this->index -= $droppedUpToCursor;
    }
}

13
src/Normalizer.php Normal file
View File

@ -0,0 +1,13 @@
<?php
namespace CookidooShoppingAdvanced;
use Nadar\Stemming\Stemm;
/**
 * Turns free-text ingredient names into a comparable, stemmed form.
 */
class Normalizer
{
    /**
     * Strips every character that is not a Unicode letter, an ASCII digit or a space,
     * collapses whitespace runs into a single space and stems the result as a German phrase.
     *
     * @param string $text raw text
     *
     * @return mixed whatever Stemm::stemPhrase() returns for the cleaned text
     */
    public static function normalize(string $text)
    {
        $lettersDigitsAndSpaces = preg_replace('/[^\p{L}0-9 ]/u', '', $text);

        return Stemm::stemPhrase(
            preg_replace('/\s+/', ' ', $lettersDigitsAndSpaces),
            'de'
        );
    }
}

17
src/RuleApplicator.php Normal file
View File

@ -0,0 +1,17 @@
<?php
namespace CookidooShoppingAdvanced;
use CookidooShoppingAdvanced\Models\ShoppingList;
use CookidooShoppingAdvanced\Rules\AbstractRule;
/**
 * Runs rules against a shopping list.
 */
class RuleApplicator
{
    /**
     * Applies the given rules one after another, in array order.
     *
     * @param AbstractRule[] $rules
     */
    public function applyRules(ShoppingList &$shoppingList, array $rules): void
    {
        array_walk($rules, function ($rule) use (&$shoppingList): void {
            $this->applyRule($shoppingList, $rule);
        });
    }

    /**
     * Applies a single rule by calling its filter() method with the list.
     */
    public function applyRule(ShoppingList &$shoppingList, AbstractRule $rule): void
    {
        $rule->filter($shoppingList);
    }
}

View File

@ -0,0 +1,8 @@
<?php
namespace CookidooShoppingAdvanced\Rules;
use CookidooShoppingAdvanced\Models\ShoppingList;
/**
 * Base class of all shopping list rules.
 */
abstract class AbstractRule {
/**
 * Applies the rule to the given shopping list.
 *
 * The method returns nothing, so implementations change the passed list in place.
 */
abstract public function filter(ShoppingList &$shoppingList): void;
}

133
src/Rules/MergeSame.php Normal file
View File

@ -0,0 +1,133 @@
<?php
namespace CookidooShoppingAdvanced\Rules;
use CookidooShoppingAdvanced\Models\Ingredient;
use CookidooShoppingAdvanced\Models\ShoppingList;
use Exception;
use Nadar\Stemming\Stemm;
/**
 * Rule that merges list entries referring to the same ingredient.
 *
 * Entries are grouped by the stem of their clean name. Within a group, entries
 * without an amount collapse into the first one, and entries with an amount are
 * summed up per unit.
 */
class MergeSame extends AbstractRule
{
    private const STRATEGY_WITHOUT_AMOUNT = 'withoutAmount';
    private const STRATEGY_BY_UNIT = 'byUnit';

    /**
     * @throws Exception when an amount that has to be summed up is not numeric
     */
    public function filter(ShoppingList &$shoppingList): void
    {
        if ($shoppingList->isEmpty()) {
            return;
        }

        $mergableIngredientsByStrategy = $this->groupByMergeStrategy(
            $this->extractMergableGroups($shoppingList)
        );

        $this->merge($shoppingList, $mergableIngredientsByStrategy);
    }

    /**
     * Groups the entries by the stem of their clean name and keeps only groups
     * with more than one entry.
     *
     * @return array<string, Ingredient[]>
     */
    private function extractMergableGroups(ShoppingList $shoppingList): array
    {
        $groupedByNames = [];
        foreach ($shoppingList as $ingredient) {
            $groupedByNames[Stemm::stem($ingredient->getCleanName(), 'de')][] = $ingredient;
        }

        return array_filter($groupedByNames, static function (array $group): bool {
            return count($group) > 1;
        });
    }

    /**
     * Splits every name group by the merge strategy that applies to each entry.
     *
     * @param array<string, Ingredient[]> $mergableIngredients
     *
     * @return array<string, array<string, Ingredient[]>> name => strategy => entries
     */
    private function groupByMergeStrategy(array $mergableIngredients): array
    {
        $strategyGrouped = [];
        foreach ($mergableIngredients as $name => $ingredientGroup) {
            $strategyGrouped[$name] = $strategyGrouped[$name] ?? [];
            foreach ($ingredientGroup as $ingredient) {
                $strategyGrouped[$name][$this->detectStrategy($ingredient)][] = $ingredient;
            }
        }

        return $strategyGrouped;
    }

    /**
     * Entries with an empty amount (in the sense of empty()) are merged without
     * amounts, all others by unit.
     */
    private function detectStrategy(Ingredient $ingredient): string
    {
        if (empty($ingredient->getAmount())) {
            return self::STRATEGY_WITHOUT_AMOUNT;
        }

        return self::STRATEGY_BY_UNIT;
    }

    /**
     * Dispatches every strategy group to the matching strategy implementation.
     */
    private function merge(ShoppingList &$shoppingList, array $mergableIngredientsByStrategy): void
    {
        foreach ($mergableIngredientsByStrategy as $ingredientGroup) {
            foreach ($ingredientGroup as $strategy => $ingredientList) {
                switch ($strategy) {
                    case self::STRATEGY_WITHOUT_AMOUNT:
                        $this->applyStrategyWithoutAmount($shoppingList, $ingredientList);
                        break;
                    case self::STRATEGY_BY_UNIT:
                        $this->applyStrategyByUnit($shoppingList, $ingredientList);
                        break;
                }
            }
        }
    }

    /**
     * WithoutAmount Strategy
     * Keep the first entry only, remove the others
     */
    private function applyStrategyWithoutAmount(ShoppingList &$shoppingList, array $ingredientGroup): void
    {
        $keep = reset($ingredientGroup);
        foreach (array_slice($ingredientGroup, 1) as $ingredient) {
            $keep->addMerged($ingredient);
            $shoppingList->remove($ingredient);
        }
    }

    /**
     * By Unit Strategy
     * Group ingredients with the same (stemmed) unit and add their amounts to
     * the first entry of each unit group; the other entries are removed.
     *
     * @throws Exception when one of the amounts to sum up is not numeric
     */
    private function applyStrategyByUnit(ShoppingList &$shoppingList, array $ingredientGroup): void
    {
        $byUnit = [];
        foreach ($ingredientGroup as $ingredient) {
            $byUnit[Stemm::stem($ingredient->getUnit(), 'de')][] = $ingredient;
        }

        foreach ($byUnit as $ingredients) {
            $baseIngredient = $ingredients[0];
            foreach (array_slice($ingredients, 1) as $ingredientToRemove) {
                // Both sides go through the same parser, so ranges and invalid
                // values are treated identically for the base and the merged entry.
                $newAmount = $this->parseAmount($baseIngredient) + $this->parseAmount($ingredientToRemove);

                $baseIngredient->setAmount((string)$newAmount);
                $baseIngredient->addMerged($ingredientToRemove);
                $shoppingList->remove($ingredientToRemove);
            }
        }
    }

    /**
     * Converts the amount of an ingredient into a number.
     *
     * For ranges such as "1-2" the part after the last dash is used.
     *
     * @throws Exception when the (range-resolved) amount is not numeric
     */
    private function parseAmount(Ingredient $ingredient): float
    {
        $amount = $ingredient->getAmount();

        // check for ranges
        if (strpos($amount, '-') !== false) {
            $amounts = array_map('trim', explode('-', $amount));
            $amount = array_pop($amounts);
        }

        if (!is_numeric($amount)) {
            throw new Exception(sprintf('Cannot merge non numeric amounts (%s) of ingredients: %s', $amount, $ingredient));
        }

        return (float)$amount;
    }
}

23
src/Rules/Omit.php Normal file
View File

@ -0,0 +1,23 @@
<?php
namespace CookidooShoppingAdvanced\Rules;
use CookidooShoppingAdvanced\Models\ShoppingList;
use Nadar\Stemming\Stemm;
/**
 * Rule that removes ingredients which never need to be bought.
 */
class Omit extends AbstractRule
{
    private const INGREDIENT_NAMES_TO_OMIT = [
        "Wasser",
    ];

    /**
     * Removes every entry whose clean name equals the stem of one of the names to omit.
     */
    public function filter(ShoppingList &$shoppingList): void
    {
        // Stemmed names as array keys allow a direct isset() lookup.
        $omit = array_flip(array_map(static function ($item) {
            return Stemm::stem($item, 'de');
        }, self::INGREDIENT_NAMES_TO_OMIT));

        // Iterate over a snapshot: remove() re-indexes the list, and removing while
        // iterating the list itself would skip the entry after each removed one.
        foreach ($shoppingList->get() as $ingredient) {
            if (isset($omit[$ingredient->getCleanName()])) {
                $shoppingList->remove($ingredient);
            }
        }
    }
}

57
src/Rules/Simplify.php Normal file
View File

@ -0,0 +1,57 @@
<?php
namespace CookidooShoppingAdvanced\Rules;
use CookidooShoppingAdvanced\Models\ShoppingList;
use CookidooShoppingAdvanced\Normalizer;
/**
 * Rule that simplifies the clean names of ingredients by dropping qualifiers
 * and mapping variants to a common name.
 */
class Simplify extends AbstractRule
{
    private const SIMPLIFICATION_OMISSION = [ // obey the replacement order
        'frisch gemahlen' => '',
        'gemahlen' => '',
        'selbst gemacht' => '',
        'konserviert' => '',
        'vegan' => '',
        'mittelscharf' => '',
    ];

    private const SIMPLIFICATION_MAPPING = [ // obey the replacement order
        'Cayenne-Pfeffer' => 'Pfeffer',
        'Paprikaschote' => 'Paprika',
        'Schalotte' => 'Zwiebel',
        'Erdäpfel' => 'Kartoffel',
        'Paprikapulver' => 'Paprika',
        'Essiggurkerl' => 'Essiggurken',
    ];

    // Words the stemming algorithm cannot reduce
    private const SIMPLIFICATION_STEMMS = [ // obey the replacement order
        'Zwiebeln' => 'Zwiebel',
        'Kartoffeln' => 'Kartoffel',
    ];

    /**
     * Rewrites the clean name of every entry that contains one of the normalized search terms.
     *
     * Only terms found in the unmodified clean name are replaced; the replacements
     * run in table order and the result is trimmed.
     */
    public function filter(ShoppingList &$shoppingList): void
    {
        // The normalized table is the same for every ingredient, so build it once.
        $replacements = $this->normalizedReplacements();

        foreach ($shoppingList as $key => $ingredient) {
            $cleanName = $ingredient->getCleanName();

            $search = $replace = [];
            foreach ($replacements as [$needle, $replacement]) {
                if (strpos($cleanName, $needle) !== false) {
                    $search[] = $needle;
                    $replace[] = $replacement;
                }
            }

            if (!empty($search)) {
                $ingredient->setCleanName(trim(str_replace($search, $replace, $cleanName)));
                $shoppingList[$key] = $ingredient;
            }
        }
    }

    /**
     * Merges the three tables (omissions, mappings, stems - in that order) and
     * normalizes both sides of every pair.
     *
     * A list of pairs is used instead of a map so that two terms with the same
     * normalized form are both kept.
     *
     * @return array<int, array{0: mixed, 1: mixed}> [normalized search term, normalized replacement]
     */
    private function normalizedReplacements(): array
    {
        $mappings = array_merge(
            self::SIMPLIFICATION_OMISSION,
            self::SIMPLIFICATION_MAPPING,
            self::SIMPLIFICATION_STEMMS
        );

        $pairs = [];
        foreach ($mappings as $simplify => $with) {
            $pairs[] = [Normalizer::normalize($simplify), Normalizer::normalize($with)];
        }

        return $pairs;
    }
}

10
src/Rules/Unify.php Normal file
View File

@ -0,0 +1,10 @@
<?php
namespace CookidooShoppingAdvanced\Rules;
use CookidooShoppingAdvanced\Models\ShoppingList;
/**
 * Placeholder rule: not implemented yet.
 */
class Unify extends AbstractRule {
/**
 * Currently a no-op; the given shopping list is left untouched.
 */
public function filter(ShoppingList &$shoppingList): void {
// todo: implement
}
}