forked from cartsnitch/cartsnitch
137 lines
4.2 KiB
Python
137 lines
4.2 KiB
Python
"""Product matching & dedup — UPC primary, fuzzy name fallback, confidence scoring.
|
|
|
|
Wraps the Phase 1 normalization module with confidence-level classification
|
|
and batch matching for purchase ingestion.
|
|
"""
|
|
|
|
import uuid
|
|
from dataclasses import dataclass
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from cartsnitch_common.constants import MatchConfidence
|
|
from cartsnitch_common.models.product import NormalizedProduct
|
|
from cartsnitch_common.normalization import (
|
|
MatchMethod,
|
|
MatchResult,
|
|
extract_size_info,
|
|
normalize_product,
|
|
)
|
|
from cartsnitch_common.schemas.purchase import PurchaseItemCreate
|
|
|
|
# Convenience alias: expose MatchConfidence from this module under the name ConfidenceLevel
|
|
ConfidenceLevel = MatchConfidence
|
|
|
|
|
|
@dataclass(frozen=True)
class MatchOutcome:
    """Immutable record of how one purchase item in a batch was matched.

    Attributes:
        item_index: Position of the item within the batch that was matched.
        match: The ``MatchResult`` produced for the item, or ``None`` when no
            existing product matched.
        confidence_level: Confidence bucket assigned to this outcome.
        created_new: ``True`` when a new product was created for the item
            instead of matching an existing one. Defaults to ``False``.
    """

    item_index: int
    match: MatchResult | None
    confidence_level: MatchConfidence
    created_new: bool = False
|
|
|
|
|
|
def classify_confidence(
    score: float,
    method: MatchMethod,
    *,
    high_threshold: float = 0.8,
    medium_threshold: float = 0.5,
) -> MatchConfidence:
    """Classify a match score into high/medium/low confidence.

    A UPC match is always classified as HIGH, whatever its score. For every
    other method the score is compared against the two thresholds.

    Args:
        score: Match score produced by the normalization step.
        method: How the match was found.
        high_threshold: Minimum score (inclusive) for HIGH confidence on a
            non-UPC match. Defaults to 0.8.
        medium_threshold: Minimum score (inclusive) for MEDIUM confidence on a
            non-UPC match. Defaults to 0.5.

    Returns:
        ``MatchConfidence.HIGH``, ``MatchConfidence.MEDIUM`` or
        ``MatchConfidence.LOW``.

    Raises:
        ValueError: If ``medium_threshold`` is greater than ``high_threshold``;
            such a configuration would make MEDIUM unreachable.
    """
    # Reject inverted thresholds up front rather than silently never
    # returning MEDIUM.
    if medium_threshold > high_threshold:
        raise ValueError(
            "medium_threshold must not exceed high_threshold "
            f"(got medium={medium_threshold!r}, high={high_threshold!r})"
        )

    if method == MatchMethod.UPC:
        return MatchConfidence.HIGH

    # Name-based matching thresholds
    if score >= high_threshold:
        return MatchConfidence.HIGH
    if score >= medium_threshold:
        return MatchConfidence.MEDIUM
    return MatchConfidence.LOW
|
|
|
|
|
|
def _create_product_from_item(
    session: Session,
    item: PurchaseItemCreate,
) -> NormalizedProduct:
    """Build, add and flush a new NormalizedProduct for an unmatched item.

    The raw product name becomes the canonical name. Size and unit are taken
    from ``extract_size_info`` when it returns something truthy, otherwise
    both are left as ``None``. The item's UPC, if present, seeds
    ``upc_variants``.

    Args:
        session: Session the new product is added to and flushed through.
        item: Purchase item that had no match.

    Returns:
        The newly created product, after ``session.flush()``.
    """
    raw_name = item.product_name_raw
    parsed_size = extract_size_info(raw_name)

    if parsed_size:
        size_value = parsed_size[0]
        unit = parsed_size[1]
    else:
        size_value = None
        unit = None

    new_product = NormalizedProduct(
        id=uuid.uuid4(),
        canonical_name=raw_name,
        size=size_value,
        size_unit=unit,
        upc_variants=[item.upc] if item.upc else [],
    )

    session.add(new_product)
    session.flush()
    return new_product
|
|
|
|
|
|
class ProductMatcher:
    """Match purchase items to normalized products, singly or in batches.

    Usage:
        matcher = ProductMatcher(session)
        outcomes = matcher.match_items(items)
    """

    def __init__(
        self,
        session: Session,
        name_threshold: float = 0.4,
        auto_create: bool = True,
    ):
        """Store the session and matching options used by every lookup.

        Args:
            session: Session passed to normalization and product creation.
            name_threshold: Forwarded to ``normalize_product`` as its
                ``name_threshold`` argument.
            auto_create: When true, an unmatched item gets a new product.
        """
        self.auto_create = auto_create
        self.name_threshold = name_threshold
        self.session = session

    def match_single(
        self,
        item: PurchaseItemCreate,
    ) -> tuple[NormalizedProduct | None, MatchResult | None, MatchConfidence]:
        """Match one purchase item against the normalized products.

        Returns:
            A ``(product, match_result, confidence_level)`` tuple. When
            nothing matches, ``match_result`` is ``None`` and the confidence
            is LOW; ``product`` is then a newly created product if
            ``auto_create`` is enabled, otherwise ``None``.
        """
        found = normalize_product(
            self.session,
            item.product_name_raw,
            upc=item.upc,
            name_threshold=self.name_threshold,
        )

        if found:
            level = classify_confidence(found.confidence, found.method)
            return found.product, found, level

        # No match: optionally create a product; confidence is LOW either way.
        fallback = (
            _create_product_from_item(self.session, item)
            if self.auto_create
            else None
        )
        return fallback, None, MatchConfidence.LOW

    def match_items(self, items: list[PurchaseItemCreate]) -> list[MatchOutcome]:
        """Match every item in a batch, returning one outcome per item in order."""
        # map() is lazy, so items are still matched one at a time, in order.
        # A product without a MatchResult can only come from auto-creation,
        # which is how created_new is derived.
        return [
            MatchOutcome(
                item_index=position,
                match=match_result,
                confidence_level=level,
                created_new=match_result is None and product is not None,
            )
            for position, (product, match_result, level) in enumerate(
                map(self.match_single, items)
            )
        ]
|
|
|
|
|
|
def match_purchase_item(
    session: Session,
    item: PurchaseItemCreate,
    name_threshold: float = 0.4,
    auto_create: bool = True,
) -> tuple[NormalizedProduct | None, MatchConfidence]:
    """Match one item with a throwaway matcher and return (product, confidence).

    Builds a ``ProductMatcher`` from the given options, runs ``match_single``
    on the item, and drops the intermediate match result.
    """
    matched_product, _unused_result, confidence_level = ProductMatcher(
        session,
        name_threshold=name_threshold,
        auto_create=auto_create,
    ).match_single(item)
    return matched_product, confidence_level
|