Merge commit '4cf6f91e954b770198578bcb8db5d98ac964bfed' as 'common'

This commit is contained in:
Coupon Carl
2026-03-28 02:24:14 +00:00
66 changed files with 7044 additions and 0 deletions
@@ -0,0 +1,26 @@
"""Data pipeline — receipt normalization, product matching, price tracking, shrinkflation."""
from cartsnitch_common.pipeline.matching import (
ConfidenceLevel,
ProductMatcher,
match_purchase_item,
)
from cartsnitch_common.pipeline.price_tracking import (
PriceDelta,
get_price_trend,
record_price_from_item,
)
from cartsnitch_common.pipeline.receipt import normalize_receipt, parse_meijer_item
from cartsnitch_common.pipeline.shrinkflation import detect_shrinkflation
__all__ = [
"ConfidenceLevel",
"PriceDelta",
"ProductMatcher",
"detect_shrinkflation",
"get_price_trend",
"match_purchase_item",
"normalize_receipt",
"parse_meijer_item",
"record_price_from_item",
]
@@ -0,0 +1,136 @@
"""Product matching & dedup — UPC primary, fuzzy name fallback, confidence scoring.
Wraps the Phase 1 normalization module with confidence-level classification
and batch matching for purchase ingestion.
"""
import uuid
from dataclasses import dataclass
from sqlalchemy.orm import Session
from cartsnitch_common.constants import MatchConfidence
from cartsnitch_common.models.product import NormalizedProduct
from cartsnitch_common.normalization import (
MatchMethod,
MatchResult,
extract_size_info,
normalize_product,
)
from cartsnitch_common.schemas.purchase import PurchaseItemCreate
# Re-export for convenience
ConfidenceLevel = MatchConfidence
@dataclass(frozen=True)
class MatchOutcome:
"""Result of matching a single purchase item to a normalized product."""
item_index: int
match: MatchResult | None
confidence_level: MatchConfidence
created_new: bool = False
def classify_confidence(score: float, method: MatchMethod) -> MatchConfidence:
"""Classify a match score into high/medium/low confidence."""
if method == MatchMethod.UPC:
return MatchConfidence.HIGH
# Name-based matching thresholds
if score >= 0.8:
return MatchConfidence.HIGH
if score >= 0.5:
return MatchConfidence.MEDIUM
return MatchConfidence.LOW
def _create_product_from_item(
session: Session,
item: PurchaseItemCreate,
) -> NormalizedProduct:
"""Create a new NormalizedProduct from a purchase item that had no match."""
size_info = extract_size_info(item.product_name_raw)
product = NormalizedProduct(
id=uuid.uuid4(),
canonical_name=item.product_name_raw,
size=size_info[0] if size_info else None,
size_unit=size_info[1] if size_info else None,
upc_variants=[item.upc] if item.upc else [],
)
session.add(product)
session.flush()
return product
class ProductMatcher:
"""Batch product matcher for purchase ingestion.
Usage:
matcher = ProductMatcher(session)
outcomes = matcher.match_items(items)
"""
def __init__(
self,
session: Session,
name_threshold: float = 0.4,
auto_create: bool = True,
):
self.session = session
self.name_threshold = name_threshold
self.auto_create = auto_create
def match_single(
self,
item: PurchaseItemCreate,
) -> tuple[NormalizedProduct | None, MatchResult | None, MatchConfidence]:
"""Match a single purchase item to a normalized product.
Returns (product, match_result, confidence_level).
If auto_create is True and no match found, creates a new product.
"""
result = normalize_product(
self.session,
item.product_name_raw,
upc=item.upc,
name_threshold=self.name_threshold,
)
if result:
confidence = classify_confidence(result.confidence, result.method)
return result.product, result, confidence
if self.auto_create:
product = _create_product_from_item(self.session, item)
return product, None, MatchConfidence.LOW
return None, None, MatchConfidence.LOW
def match_items(self, items: list[PurchaseItemCreate]) -> list[MatchOutcome]:
"""Match a batch of purchase items. Returns outcomes in order."""
outcomes: list[MatchOutcome] = []
for idx, item in enumerate(items):
product, result, confidence = self.match_single(item)
created = result is None and product is not None
outcomes.append(
MatchOutcome(
item_index=idx,
match=result,
confidence_level=confidence,
created_new=created,
)
)
return outcomes
def match_purchase_item(
session: Session,
item: PurchaseItemCreate,
name_threshold: float = 0.4,
auto_create: bool = True,
) -> tuple[NormalizedProduct | None, MatchConfidence]:
"""Convenience function: match a single item, return (product, confidence)."""
matcher = ProductMatcher(session, name_threshold=name_threshold, auto_create=auto_create)
product, _, confidence = matcher.match_single(item)
return product, confidence
@@ -0,0 +1,130 @@
"""Price history tracking — record prices and detect deltas.
On each purchase ingestion, writes price_history records and detects
price changes from previous entries for the same product+store.
"""
import uuid
from dataclasses import dataclass
from datetime import date
from decimal import Decimal
from sqlalchemy import and_, select
from sqlalchemy.orm import Session
from cartsnitch_common.constants import PriceSource
from cartsnitch_common.models.price import PriceHistory
@dataclass(frozen=True)
class PriceDelta:
"""A detected price change for a product at a store."""
product_id: uuid.UUID
store_id: uuid.UUID
old_price: Decimal
new_price: Decimal
change_amount: Decimal
change_percent: Decimal
old_date: date
new_date: date
@property
def is_increase(self) -> bool:
return self.change_amount > 0
@property
def is_decrease(self) -> bool:
return self.change_amount < 0
def get_latest_price(
session: Session,
product_id: uuid.UUID,
store_id: uuid.UUID,
) -> PriceHistory | None:
"""Get the most recent price entry for a product at a store."""
stmt = (
select(PriceHistory)
.where(
and_(
PriceHistory.normalized_product_id == product_id,
PriceHistory.store_id == store_id,
)
)
.order_by(PriceHistory.observed_date.desc())
.limit(1)
)
return session.execute(stmt).scalar_one_or_none()
def record_price_from_item(
session: Session,
product_id: uuid.UUID,
store_id: uuid.UUID,
observed_date: date,
regular_price: Decimal,
sale_price: Decimal | None = None,
loyalty_price: Decimal | None = None,
coupon_price: Decimal | None = None,
purchase_item_id: uuid.UUID | None = None,
source: PriceSource = PriceSource.RECEIPT,
) -> tuple[PriceHistory, PriceDelta | None]:
"""Record a price observation and return any detected delta.
Returns (price_history_entry, price_delta_or_none).
"""
previous = get_latest_price(session, product_id, store_id)
entry = PriceHistory(
id=uuid.uuid4(),
normalized_product_id=product_id,
store_id=store_id,
observed_date=observed_date,
regular_price=regular_price,
sale_price=sale_price,
loyalty_price=loyalty_price,
coupon_price=coupon_price,
source=source,
purchase_item_id=purchase_item_id,
)
session.add(entry)
session.flush()
delta = None
if previous and previous.regular_price != regular_price:
change = regular_price - previous.regular_price
pct = (change / previous.regular_price * 100) if previous.regular_price else Decimal("0")
delta = PriceDelta(
product_id=product_id,
store_id=store_id,
old_price=previous.regular_price,
new_price=regular_price,
change_amount=change,
change_percent=pct.quantize(Decimal("0.01")),
old_date=previous.observed_date,
new_date=observed_date,
)
return entry, delta
def get_price_trend(
session: Session,
product_id: uuid.UUID,
store_id: uuid.UUID,
limit: int = 30,
) -> list[PriceHistory]:
"""Get recent price history for a product at a store, newest first."""
stmt = (
select(PriceHistory)
.where(
and_(
PriceHistory.normalized_product_id == product_id,
PriceHistory.store_id == store_id,
)
)
.order_by(PriceHistory.observed_date.desc())
.limit(limit)
)
return list(session.execute(stmt).scalars().all())
@@ -0,0 +1,144 @@
"""Receipt normalization — parse raw Meijer scraper output into purchase records.
Maps raw receipt fields, cleans product names, extracts quantities/units.
"""
import re
from datetime import date
from decimal import Decimal, InvalidOperation
from cartsnitch_common.schemas.purchase import PurchaseCreate, PurchaseItemCreate
def _clean_product_name(raw: str) -> str:
"""Clean raw product name from scraper output."""
cleaned = raw.strip()
# Remove leading/trailing non-alphanumeric chars
cleaned = re.sub(r"^\W+|\W+$", "", cleaned)
# Collapse internal whitespace
cleaned = re.sub(r"\s+", " ", cleaned)
return cleaned
def _safe_decimal(
value: str | float | int | Decimal | None,
default: Decimal = Decimal("0"),
) -> Decimal:
"""Safely convert a value to Decimal."""
if value is None:
return default
try:
return Decimal(str(value))
except (InvalidOperation, ValueError):
return default
def parse_meijer_item(raw_item: dict) -> PurchaseItemCreate:
"""Parse a single Meijer scraper line item into a PurchaseItemCreate.
Expected raw_item keys (from Meijer scraper):
- description / name: product name
- upc / upcCode: UPC barcode
- quantity / qty: number of units
- unitPrice / price: per-unit price
- extendedPrice / totalPrice: line total
- regularPrice: shelf price before discounts
- salePrice: sale price if applicable
- couponAmount / couponDiscount: coupon savings
- loyaltyAmount / loyaltyDiscount: loyalty savings
- category / department: raw category
"""
name = raw_item.get("description") or raw_item.get("name") or ""
cleaned_name = _clean_product_name(name)
upc = raw_item.get("upc") or raw_item.get("upcCode")
if upc:
upc = str(upc).strip().lstrip("0") or str(upc).strip()
qty = _safe_decimal(
raw_item.get("quantity") or raw_item.get("qty"),
default=Decimal("1"),
)
unit_price = _safe_decimal(raw_item.get("unitPrice") or raw_item.get("price"))
extended = _safe_decimal(raw_item.get("extendedPrice") or raw_item.get("totalPrice"))
if extended == Decimal("0") and unit_price > 0:
extended = unit_price * qty
regular = raw_item.get("regularPrice")
sale = raw_item.get("salePrice")
coupon = raw_item.get("couponAmount") or raw_item.get("couponDiscount")
loyalty = raw_item.get("loyaltyAmount") or raw_item.get("loyaltyDiscount")
category = raw_item.get("category") or raw_item.get("department")
return PurchaseItemCreate(
product_name_raw=cleaned_name,
upc=upc,
quantity=qty,
unit_price=unit_price,
extended_price=extended,
regular_price=_safe_decimal(regular) if regular is not None else None,
sale_price=_safe_decimal(sale) if sale is not None else None,
coupon_discount=_safe_decimal(coupon) if coupon is not None else None,
loyalty_discount=_safe_decimal(loyalty) if loyalty is not None else None,
category_raw=str(category).strip() if category else None,
)
def normalize_receipt(
raw_receipt: dict,
user_id: str,
store_id: str,
) -> PurchaseCreate:
"""Parse a complete Meijer raw receipt into a PurchaseCreate.
Expected raw_receipt keys:
- receiptId / receipt_id / id: unique receipt identifier
- date / purchaseDate / purchase_date: purchase date (YYYY-MM-DD or similar)
- total / totalAmount: receipt total
- subtotal: pre-tax subtotal
- tax / taxAmount: tax amount
- savings / totalSavings: total discount savings
- items: list of raw line item dicts
"""
import uuid
receipt_id = str(
raw_receipt.get("receiptId")
or raw_receipt.get("receipt_id")
or raw_receipt.get("id")
or uuid.uuid4()
)
raw_date = (
raw_receipt.get("date")
or raw_receipt.get("purchaseDate")
or raw_receipt.get("purchase_date")
)
if isinstance(raw_date, str):
purchase_date = date.fromisoformat(raw_date[:10])
elif isinstance(raw_date, date):
purchase_date = raw_date
else:
purchase_date = date.today()
total = _safe_decimal(raw_receipt.get("total") or raw_receipt.get("totalAmount"))
subtotal = raw_receipt.get("subtotal")
tax = raw_receipt.get("tax") or raw_receipt.get("taxAmount")
savings = raw_receipt.get("savings") or raw_receipt.get("totalSavings")
raw_items = raw_receipt.get("items") or []
items = [parse_meijer_item(item) for item in raw_items]
return PurchaseCreate(
user_id=uuid.UUID(user_id) if isinstance(user_id, str) else user_id,
store_id=uuid.UUID(store_id) if isinstance(store_id, str) else store_id,
receipt_id=receipt_id,
purchase_date=purchase_date,
total=total,
subtotal=_safe_decimal(subtotal) if subtotal is not None else None,
tax=_safe_decimal(tax) if tax is not None else None,
savings_total=_safe_decimal(savings) if savings is not None else None,
raw_data=raw_receipt,
items=items,
)
@@ -0,0 +1,165 @@
"""Shrinkflation detection — compare unit sizes across price history.
Flags cases where a product's size decreased while price stayed flat or increased.
"""
import uuid
from dataclasses import dataclass
from datetime import date
from decimal import Decimal
from sqlalchemy import and_, select
from sqlalchemy.orm import Session
from cartsnitch_common.constants import SizeUnit
from cartsnitch_common.models.product import NormalizedProduct
from cartsnitch_common.models.shrinkflation import ShrinkflationEvent
# Conversion factors to a common base unit (grams for weight, ml for volume, count for discrete)
_WEIGHT_TO_GRAMS: dict[SizeUnit, Decimal] = {
SizeUnit.G: Decimal("1"),
SizeUnit.KG: Decimal("1000"),
SizeUnit.OZ: Decimal("28.3495"),
SizeUnit.LB: Decimal("453.592"),
}
_VOLUME_TO_ML: dict[SizeUnit, Decimal] = {
SizeUnit.ML: Decimal("1"),
SizeUnit.L: Decimal("1000"),
SizeUnit.FL_OZ: Decimal("29.5735"),
}
_COUNT_UNITS: set[SizeUnit] = {SizeUnit.CT, SizeUnit.PK}
def _to_comparable(size: str, unit: SizeUnit) -> Decimal | None:
"""Convert a size+unit to a comparable numeric value.
Returns None if units are not comparable (different measurement systems).
"""
try:
size_val = Decimal(size)
except Exception:
return None
if unit in _WEIGHT_TO_GRAMS:
return size_val * _WEIGHT_TO_GRAMS[unit]
if unit in _VOLUME_TO_ML:
return size_val * _VOLUME_TO_ML[unit]
if unit in _COUNT_UNITS:
return size_val
return None
def _units_comparable(unit_a: SizeUnit, unit_b: SizeUnit) -> bool:
"""Check if two units are in the same measurement system."""
if unit_a in _WEIGHT_TO_GRAMS and unit_b in _WEIGHT_TO_GRAMS:
return True
if unit_a in _VOLUME_TO_ML and unit_b in _VOLUME_TO_ML:
return True
return unit_a in _COUNT_UNITS and unit_b in _COUNT_UNITS
@dataclass(frozen=True)
class ShrinkflationCandidate:
"""A potential shrinkflation detection before writing to DB."""
product: NormalizedProduct
old_size: str
new_size: str
old_unit: SizeUnit
new_unit: SizeUnit
old_price: Decimal | None
new_price: Decimal | None
confidence: Decimal
size_change_pct: Decimal
def detect_shrinkflation(
session: Session,
product: NormalizedProduct,
new_size: str,
new_unit: SizeUnit,
new_price: Decimal | None = None,
detected_date: date | None = None,
min_size_decrease_pct: Decimal = Decimal("1"),
) -> ShrinkflationEvent | None:
"""Check if a product's size has decreased (shrinkflation).
Compares the new size against the product's recorded size.
If size decreased while price stayed flat or increased, records a shrinkflation event.
Returns the ShrinkflationEvent if detected, None otherwise.
"""
if not product.size or not product.size_unit:
return None
old_unit = SizeUnit(product.size_unit)
if not _units_comparable(old_unit, new_unit):
return None
old_comparable = _to_comparable(product.size, old_unit)
new_comparable = _to_comparable(new_size, new_unit)
if old_comparable is None or new_comparable is None:
return None
if new_comparable >= old_comparable:
return None # Size didn't decrease
size_change_pct = ((old_comparable - new_comparable) / old_comparable * 100).quantize(
Decimal("0.01")
)
if size_change_pct < min_size_decrease_pct:
return None
# Check existing events to avoid duplicates
existing = session.execute(
select(ShrinkflationEvent).where(
and_(
ShrinkflationEvent.normalized_product_id == product.id,
ShrinkflationEvent.old_size == product.size,
ShrinkflationEvent.new_size == new_size,
)
)
).scalar_one_or_none()
if existing:
return existing
# Confidence: higher if size change is significant and price didn't drop
confidence = Decimal("0.70")
if size_change_pct >= Decimal("5"):
confidence = Decimal("0.85")
if size_change_pct >= Decimal("10"):
confidence = Decimal("0.95")
# Get the last known price for comparison
old_price: Decimal | None = None
if product.price_histories:
latest = max(product.price_histories, key=lambda ph: ph.observed_date)
old_price = latest.regular_price
if old_price is not None and new_price is not None and new_price < old_price:
# Price actually dropped — less likely to be shrinkflation
confidence = max(Decimal("0.30"), confidence - Decimal("0.30"))
event = ShrinkflationEvent(
id=uuid.uuid4(),
normalized_product_id=product.id,
detected_date=detected_date or date.today(),
old_size=product.size,
new_size=new_size,
old_unit=old_unit,
new_unit=new_unit,
price_at_old_size=old_price,
price_at_new_size=new_price,
confidence=confidence,
notes=(
f"Size decreased {size_change_pct}% ({product.size} {old_unit}{new_size} {new_unit})"
),
)
session.add(event)
session.flush()
return event