forked from cartsnitch/cartsnitch
145 lines
5.0 KiB
Python
145 lines
5.0 KiB
Python
"""Receipt normalization — parse raw Meijer scraper output into purchase records.
|
|
|
|
Maps raw receipt fields, cleans product names, extracts quantities/units.
|
|
"""
|
|
|
|
import re
|
|
from datetime import date
|
|
from decimal import Decimal, InvalidOperation
|
|
|
|
from cartsnitch_common.schemas.purchase import PurchaseCreate, PurchaseItemCreate
|
|
|
|
|
|
def _clean_product_name(raw: str) -> str:
|
|
"""Clean raw product name from scraper output."""
|
|
cleaned = raw.strip()
|
|
# Remove leading/trailing non-alphanumeric chars
|
|
cleaned = re.sub(r"^\W+|\W+$", "", cleaned)
|
|
# Collapse internal whitespace
|
|
cleaned = re.sub(r"\s+", " ", cleaned)
|
|
return cleaned
|
|
|
|
|
|
def _safe_decimal(
|
|
value: str | float | int | Decimal | None,
|
|
default: Decimal = Decimal("0"),
|
|
) -> Decimal:
|
|
"""Safely convert a value to Decimal."""
|
|
if value is None:
|
|
return default
|
|
try:
|
|
return Decimal(str(value))
|
|
except (InvalidOperation, ValueError):
|
|
return default
|
|
|
|
|
|
def parse_meijer_item(raw_item: dict) -> PurchaseItemCreate:
    """Parse a single Meijer scraper line item into a PurchaseItemCreate.

    Expected raw_item keys (from Meijer scraper):
    - description / name: product name
    - upc / upcCode: UPC barcode
    - quantity / qty: number of units
    - unitPrice / price: per-unit price
    - extendedPrice / totalPrice: line total
    - regularPrice: shelf price before discounts
    - salePrice: sale price if applicable
    - couponAmount / couponDiscount: coupon savings
    - loyaltyAmount / loyaltyDiscount: loyalty savings
    - category / department: raw category

    Notes:
        - A coupon or loyalty discount reported as numeric zero is kept as
          ``Decimal("0")``; those fields are None only when every candidate
          key is absent, None, or an empty string.
        - A UPC or category that is empty after trimming is reported as None.
        - A missing or zero quantity falls back to one unit.
        - When the line total is missing or zero and the unit price is
          positive, the total is computed as unit price times quantity.
    """

    def first_present(*keys):
        """Return the first value under ``keys`` that is neither None nor ""."""
        for key in keys:
            candidate = raw_item.get(key)
            if candidate is not None and candidate != "":
                return candidate
        return None

    def optional_decimal(value):
        """Convert ``value`` to Decimal, keeping None for absent values."""
        return None if value is None else _safe_decimal(value)

    name = raw_item.get("description") or raw_item.get("name") or ""
    cleaned_name = _clean_product_name(name)

    raw_upc = raw_item.get("upc") or raw_item.get("upcCode")
    upc_text = str(raw_upc).strip() if raw_upc else ""
    # Leading zeros are dropped, but an all-zero code is kept verbatim rather
    # than collapsing to nothing; an empty result is reported as None.
    upc = upc_text.lstrip("0") or upc_text or None

    qty = _safe_decimal(
        raw_item.get("quantity") or raw_item.get("qty"),
        default=Decimal("1"),
    )

    unit_price = _safe_decimal(raw_item.get("unitPrice") or raw_item.get("price"))
    extended = _safe_decimal(raw_item.get("extendedPrice") or raw_item.get("totalPrice"))
    if extended == Decimal("0") and unit_price > 0:
        extended = unit_price * qty

    # `or` chaining would discard a legitimate numeric 0 discount, so the
    # discount pairs are resolved by presence instead of truthiness.
    coupon = first_present("couponAmount", "couponDiscount")
    loyalty = first_present("loyaltyAmount", "loyaltyDiscount")

    category = raw_item.get("category") or raw_item.get("department")
    category_text = str(category).strip() if category else ""

    return PurchaseItemCreate(
        product_name_raw=cleaned_name,
        upc=upc,
        quantity=qty,
        unit_price=unit_price,
        extended_price=extended,
        regular_price=optional_decimal(raw_item.get("regularPrice")),
        sale_price=optional_decimal(raw_item.get("salePrice")),
        coupon_discount=optional_decimal(coupon),
        loyalty_discount=optional_decimal(loyalty),
        category_raw=category_text or None,
    )
|
|
|
|
|
|
def normalize_receipt(
    raw_receipt: dict,
    user_id: str,
    store_id: str,
) -> PurchaseCreate:
    """Parse a complete Meijer raw receipt into a PurchaseCreate.

    Expected raw_receipt keys:
    - receiptId / receipt_id / id: unique receipt identifier
    - date / purchaseDate / purchase_date: purchase date (YYYY-MM-DD or similar)
    - total / totalAmount: receipt total
    - subtotal: pre-tax subtotal
    - tax / taxAmount: tax amount
    - savings / totalSavings: total discount savings
    - items: list of raw line item dicts

    Args:
        raw_receipt: Raw receipt mapping; it is also stored unchanged as
            ``raw_data`` on the result.
        user_id: Owner of the purchase; a string is converted with
            ``uuid.UUID``, any other value is passed through unchanged.
        store_id: Store of the purchase; handled like ``user_id``.

    Returns:
        The populated PurchaseCreate. A random UUID string is used as the
        receipt id when none of the id keys holds a truthy value, and
        today's date is used when no date is supplied.

    Raises:
        ValueError: If a string date does not start with an ISO
            ``YYYY-MM-DD`` date, or a string id is not a valid UUID.
    """
    import uuid
    from datetime import datetime

    def first_present(*keys):
        """Return the first value under ``keys`` that is neither None nor ""."""
        for key in keys:
            candidate = raw_receipt.get(key)
            if candidate is not None and candidate != "":
                return candidate
        return None

    def optional_decimal(value):
        """Convert ``value`` to Decimal, keeping None for absent values."""
        return None if value is None else _safe_decimal(value)

    receipt_id = str(
        raw_receipt.get("receiptId")
        or raw_receipt.get("receipt_id")
        or raw_receipt.get("id")
        or uuid.uuid4()
    )

    raw_date = (
        raw_receipt.get("date")
        or raw_receipt.get("purchaseDate")
        or raw_receipt.get("purchase_date")
    )
    if isinstance(raw_date, str):
        # Only the leading YYYY-MM-DD part is parsed, so full ISO timestamps
        # are accepted as well.
        purchase_date = date.fromisoformat(raw_date.strip()[:10])
    elif isinstance(raw_date, datetime):
        # datetime is a subclass of date, so it must be tested first to
        # reduce it to a plain calendar date.
        purchase_date = raw_date.date()
    elif isinstance(raw_date, date):
        purchase_date = raw_date
    else:
        purchase_date = date.today()

    total = _safe_decimal(raw_receipt.get("total") or raw_receipt.get("totalAmount"))
    subtotal = raw_receipt.get("subtotal")
    # `or` chaining would turn a legitimate numeric 0 (e.g. an untaxed
    # receipt) into None, so these pairs are resolved by presence.
    tax = first_present("tax", "taxAmount")
    savings = first_present("savings", "totalSavings")

    raw_items = raw_receipt.get("items") or []
    items = [parse_meijer_item(item) for item in raw_items]

    return PurchaseCreate(
        user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id,
        store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id,
        receipt_id=receipt_id,
        purchase_date=purchase_date,
        total=total,
        subtotal=optional_decimal(subtotal),
        tax=optional_decimal(tax),
        savings_total=optional_decimal(savings),
        raw_data=raw_receipt,
        items=items,
    )
|