forked from cartsnitch/cartsnitch
Squashed 'receiptwitness/' content from commit e8d374a
git-subtree-dir: receiptwitness git-subtree-split: e8d374a89ed8978f429598e02d31b1c5963efe22
This commit is contained in:
@@ -0,0 +1,23 @@
|
||||
"""Shared test fixtures for pipeline tests."""
|
||||
|
||||
import pytest
|
||||
from cartsnitch_common.models.base import Base
|
||||
from sqlalchemy import create_engine
|
||||
from sqlalchemy.orm import sessionmaker
|
||||
|
||||
|
||||
@pytest.fixture
def engine():
    """Yield a throwaway in-memory SQLite engine with the schema created.

    Every table registered on ``Base.metadata`` is created before the engine
    is handed to the test; the engine is disposed once the test is done.
    """
    memory_engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(memory_engine)
    yield memory_engine
    memory_engine.dispose()
|
||||
|
||||
|
||||
@pytest.fixture
def session(engine):
    """Yield a SQLAlchemy session bound to the in-memory ``engine`` fixture.

    The session is opened as a context manager, so it is closed when the
    test that requested it finishes.
    """
    make_session = sessionmaker(bind=engine)
    with make_session() as db_session:
        yield db_session
|
||||
@@ -0,0 +1,161 @@
|
||||
"""Tests for product matching & dedup pipeline."""
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
from decimal import Decimal
|
||||
|
||||
from cartsnitch_common.constants import MatchConfidence
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
from cartsnitch_common.schemas.purchase import PurchaseItemCreate
|
||||
|
||||
from receiptwitness.pipeline.matching import (
|
||||
ProductMatcher,
|
||||
classify_confidence,
|
||||
match_purchase_item,
|
||||
)
|
||||
from receiptwitness.pipeline.normalization import MatchMethod
|
||||
|
||||
|
||||
class TestClassifyConfidence:
    """Checks how classify_confidence buckets a score per match method."""

    def test_upc_always_high(self):
        """UPC matches are HIGH for both a perfect and a middling score."""
        for score in (1.0, 0.5):
            assert classify_confidence(score, MatchMethod.UPC) == MatchConfidence.HIGH

    def test_name_high(self):
        """Name matches scoring 0.9 and 0.8 are HIGH."""
        for score in (0.9, 0.8):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.HIGH

    def test_name_medium(self):
        """Name matches scoring 0.6 and 0.5 are MEDIUM."""
        for score in (0.6, 0.5):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.MEDIUM

    def test_name_low(self):
        """Name matches scoring 0.3 and 0.0 are LOW."""
        for score in (0.3, 0.0):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.LOW
|
||||
|
||||
|
||||
class TestProductMatcher:
    """Exercises ProductMatcher against the in-memory database session."""

    def _make_item(self, name: str, upc: str | None = None) -> PurchaseItemCreate:
        """Build a purchase item priced at 3.99 with the given name and UPC."""
        return PurchaseItemCreate(
            product_name_raw=name,
            upc=upc,
            unit_price=Decimal("3.99"),
            extended_price=Decimal("3.99"),
        )

    def test_match_by_upc(self, session):
        """An item whose UPC is a stored variant resolves to that product."""
        milk = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk Gallon",
            upc_variants=["041250000001"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(milk)
        session.commit()

        matcher = ProductMatcher(session)
        purchase_item = self._make_item("Kroger Milk", upc="041250000001")
        matched, match_result, level = matcher.match_single(purchase_item)

        assert matched is not None
        assert matched.id == milk.id
        assert match_result is not None
        assert match_result.method == MatchMethod.UPC
        assert level == MatchConfidence.HIGH

    def test_match_by_name(self, session):
        """Without a UPC, a similar name is matched via the NAME method."""
        milk = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk Gallon",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(milk)
        session.commit()

        matcher = ProductMatcher(session, name_threshold=0.3)
        purchase_item = self._make_item("Whole Milk Gallon Size")
        matched, match_result, _level = matcher.match_single(purchase_item)

        assert matched is not None
        assert match_result is not None
        assert match_result.method == MatchMethod.NAME

    def test_auto_create_when_no_match(self, session):
        """With auto_create on, an unmatched item yields a new LOW product."""
        matcher = ProductMatcher(session, auto_create=True)
        purchase_item = self._make_item("Unique Product XYZ 16 oz")
        created, match_result, level = matcher.match_single(purchase_item)

        assert created is not None
        assert match_result is None  # No match found, was created
        assert level == MatchConfidence.LOW
        assert created.canonical_name == "Unique Product XYZ 16 oz"
        assert created.size == "16"
        assert created.size_unit == "oz"

    def test_no_create_when_disabled(self, session):
        """With auto_create off, an unmatched item yields no product."""
        matcher = ProductMatcher(session, auto_create=False)
        purchase_item = self._make_item("Nonexistent Product")
        matched, match_result, _level = matcher.match_single(purchase_item)

        assert matched is None
        assert match_result is None

    def test_batch_match(self, session):
        """match_items returns one outcome per item, in input order."""
        eggs = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Large Eggs 12 Count",
            upc_variants=["012345"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(eggs)
        session.commit()

        matcher = ProductMatcher(session)
        batch = [
            self._make_item("Large Eggs", upc="012345"),
            self._make_item("Brand New Never Seen Product"),
        ]
        outcomes = matcher.match_items(batch)

        assert len(outcomes) == 2
        known, unseen = outcomes[0], outcomes[1]
        assert known.match is not None
        assert known.confidence_level == MatchConfidence.HIGH
        assert known.created_new is False
        assert unseen.match is None
        assert unseen.created_new is True
|
||||
|
||||
|
||||
class TestMatchPurchaseItem:
    """Covers the match_purchase_item convenience wrapper."""

    def test_convenience_function(self, session):
        """A stored UPC variant gives a product with HIGH confidence."""
        beef = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Ground Beef 80/20",
            upc_variants=["999888"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(beef)
        session.commit()

        purchase_item = PurchaseItemCreate(
            product_name_raw="Ground Beef",
            upc="999888",
            unit_price=Decimal("5.99"),
            extended_price=Decimal("5.99"),
        )
        matched, level = match_purchase_item(session, purchase_item)

        assert matched is not None
        assert level == MatchConfidence.HIGH

    def test_auto_create_default(self, session):
        """With default arguments an unknown item still yields a LOW product."""
        purchase_item = PurchaseItemCreate(
            product_name_raw="Totally New Item",
            unit_price=Decimal("1.00"),
            extended_price=Decimal("1.00"),
        )
        matched, level = match_purchase_item(session, purchase_item)

        assert matched is not None
        assert level == MatchConfidence.LOW
|
||||
@@ -0,0 +1,158 @@
|
||||
"""Tests for product normalization module."""
|
||||
|
||||
import uuid
|
||||
from datetime import UTC, datetime
|
||||
|
||||
from cartsnitch_common.models.product import NormalizedProduct
|
||||
|
||||
from receiptwitness.pipeline.normalization import (
|
||||
MatchMethod,
|
||||
clean_name,
|
||||
extract_size_info,
|
||||
jaccard_similarity,
|
||||
match_by_name,
|
||||
match_by_upc,
|
||||
normalize_product,
|
||||
)
|
||||
|
||||
|
||||
class TestCleanName:
    """Unit tests for clean_name, the product-name normalizer."""

    def test_lowercase(self):
        """Mixed-case input comes back entirely lower-cased."""
        assert clean_name("Kroger WHOLE MILK") == "kroger whole milk"

    def test_removes_size_info(self):
        """A size unit embedded in the name is stripped out."""
        assert "oz" not in clean_name("Milk 16 oz Whole")

    def test_removes_noise_words(self):
        """Filler words are dropped as whole tokens."""
        tokens = clean_name("The Original Brand Milk").split()
        for noise_word in ("the", "original", "brand"):
            assert noise_word not in tokens

    def test_collapses_whitespace(self):
        """Runs of spaces in the input never survive as a double space.

        The input deliberately contains multi-space gaps and the check looks
        for a *double* space: single spaces between words are expected to
        remain (see test_lowercase), so checking for a single space would
        reject every multi-word result.
        """
        assert "  " not in clean_name("Milk   Whole   Gallon")

    def test_removes_punctuation(self):
        """Apostrophes and parentheses do not appear in the cleaned name."""
        cleaned = clean_name("Meijer's Best (Organic) Milk!")
        assert "'" not in cleaned
        assert "(" not in cleaned
|
||||
|
||||
|
||||
class TestExtractSizeInfo:
    """Unit tests for extract_size_info's (amount, unit) parsing."""

    def test_extracts_oz(self):
        """Ounces are reported with the "oz" unit."""
        assert extract_size_info("Cereal 18 oz box") == ("18", "oz")

    def test_extracts_fl_oz(self):
        """Fluid ounces are reported with the "fl_oz" unit."""
        assert extract_size_info("Juice 64 fl oz") == ("64", "fl_oz")

    def test_extracts_lb(self):
        """A decimal pound amount is kept as the string "1.5"."""
        assert extract_size_info("Ground Beef 1.5 lb") == ("1.5", "lb")

    def test_extracts_ct(self):
        """Counts are reported with the "ct" unit."""
        assert extract_size_info("Eggs Large 12 ct") == ("12", "ct")

    def test_no_size_returns_none(self):
        """A name with no size information yields None."""
        size = extract_size_info("Bananas")
        assert size is None
|
||||
|
||||
|
||||
class TestJaccardSimilarity:
    """Unit tests for the jaccard_similarity scoring function."""

    def test_identical_strings(self):
        """Identical inputs score exactly 1.0."""
        text = "whole milk gallon"
        assert jaccard_similarity(text, "whole milk gallon") == 1.0

    def test_completely_different(self):
        """Inputs sharing no words score exactly 0.0."""
        similarity = jaccard_similarity("apple juice", "ground beef")
        assert similarity == 0.0

    def test_partial_overlap(self):
        """Two of three shared words land strictly between 0.4 and 0.8."""
        similarity = jaccard_similarity("kroger whole milk", "meijer whole milk")
        # "whole" and "milk" overlap
        assert 0.4 < similarity < 0.8

    def test_empty_strings(self):
        """An empty string on either side scores 0.0."""
        for left, right in (("", ""), ("milk", "")):
            assert jaccard_similarity(left, right) == 0.0
|
||||
|
||||
|
||||
class TestMatchByUPC:
    """Tests for match_by_upc lookups against stored UPC variants."""

    def test_match_found(self, session):
        """A UPC listed in a product's variants matches with confidence 1.0."""
        milk = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk, Gallon",
            upc_variants=["0041250000001", "0041250000002"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(milk)
        session.commit()

        # NOTE(review): this runs against in-memory SQLite; confirm that
        # match_by_upc's variant lookup does not depend on PostgreSQL-only
        # JSONB containment, otherwise this test cannot pass here.
        found = match_by_upc(session, "0041250000001")

        assert found is not None
        assert found.method == MatchMethod.UPC
        assert found.confidence == 1.0

    def test_no_match(self, session):
        """An unknown UPC on an empty database yields None."""
        found = match_by_upc(session, "9999999999999")
        assert found is None
|
||||
|
||||
|
||||
class TestMatchByName:
    """Tests for match_by_name's similarity-based product lookup."""

    def test_exact_name_match(self, session):
        """A name differing only by punctuation matches above 0.5."""
        milk = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk, Gallon",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(milk)
        session.commit()

        found = match_by_name(session, "Whole Milk Gallon")

        assert found is not None
        assert found.method == MatchMethod.NAME
        assert found.confidence > 0.5

    def test_fuzzy_match(self, session):
        """A different brand with the same product words clears a 0.3 threshold."""
        kroger_milk = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Kroger Whole Milk, 1 Gallon",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(kroger_milk)
        session.commit()

        found = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3)

        assert found is not None
        assert found.confidence > 0.3

    def test_no_match_below_threshold(self, session):
        """An unrelated name yields None at a 0.5 threshold."""
        beef = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Ground Beef 80/20",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(beef)
        session.commit()

        found = match_by_name(session, "Apple Juice 64 oz", threshold=0.5)

        assert found is None
|
||||
|
||||
|
||||
class TestNormalizeProduct:
    """Tests for normalize_product when no UPC is supplied."""

    def test_name_fallback(self, session):
        """With upc=None the lookup resolves through the NAME method."""
        eggs = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Large Eggs, 12 count",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(eggs)
        session.commit()

        outcome = normalize_product(session, "Large Eggs 12 ct", upc=None)

        assert outcome is not None
        assert outcome.method == MatchMethod.NAME

    def test_no_match(self, session):
        """An unknown name on an empty database yields None."""
        outcome = normalize_product(session, "Nonexistent Product XYZ", upc=None)
        assert outcome is None
|
||||
@@ -0,0 +1,204 @@
|
||||
"""Tests for receipt normalization pipeline."""
|
||||
|
||||
import uuid
|
||||
from datetime import date
|
||||
from decimal import Decimal
|
||||
|
||||
from receiptwitness.pipeline.receipt import (
|
||||
_clean_product_name,
|
||||
_safe_decimal,
|
||||
normalize_receipt,
|
||||
parse_meijer_item,
|
||||
)
|
||||
|
||||
|
||||
class TestCleanProductName:
    """Unit tests for the _clean_product_name receipt helper."""

    def test_strips_whitespace(self):
        """Leading and trailing spaces are removed."""
        assert _clean_product_name(" Milk ") == "Milk"

    def test_removes_leading_punctuation(self):
        """Dashes wrapped around the name are removed on both sides."""
        assert _clean_product_name("---Milk---") == "Milk"

    def test_collapses_internal_whitespace(self):
        """Runs of spaces between words collapse to a single space.

        The input must actually contain repeated spaces; an already-clean
        input would pass even if no collapsing happened.
        """
        assert _clean_product_name("Whole   Milk    Gallon") == "Whole Milk Gallon"

    def test_empty_string(self):
        """An empty name stays empty."""
        assert _clean_product_name("") == ""
|
||||
|
||||
|
||||
class TestSafeDecimal:
    """Unit tests for _safe_decimal's conversion and fallback behaviour."""

    def test_string_input(self):
        """A numeric string converts to the equal Decimal."""
        converted = _safe_decimal("3.99")
        assert converted == Decimal("3.99")

    def test_float_input(self):
        """The float 3.99 converts to Decimal("3.99")."""
        converted = _safe_decimal(3.99)
        assert converted == Decimal("3.99")

    def test_int_input(self):
        """An int converts to the equal Decimal."""
        converted = _safe_decimal(4)
        assert converted == Decimal("4")

    def test_none_returns_default(self):
        """None with no explicit default gives Decimal("0")."""
        converted = _safe_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """None gives back the caller-supplied default."""
        fallback = Decimal("1")
        assert _safe_decimal(None, fallback) == Decimal("1")

    def test_invalid_returns_default(self):
        """An unparsable string gives Decimal("0")."""
        converted = _safe_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_decimal_passthrough(self):
        """A Decimal input comes back equal to itself."""
        amount = Decimal("5.50")
        assert _safe_decimal(amount) == Decimal("5.50")
|
||||
|
||||
|
||||
class TestParseMeijerItem:
    """Tests for parse_meijer_item across the raw field-name variants."""

    def test_basic_item(self):
        """The primary field names map onto the parsed item's attributes."""
        payload = {
            "description": "Kroger Whole Milk 1 Gallon",
            "upc": "0041250000001",
            "quantity": 1,
            "unitPrice": "3.99",
            "extendedPrice": "3.99",
            "category": "DAIRY",
        }
        parsed = parse_meijer_item(payload)

        assert parsed.product_name_raw == "Kroger Whole Milk 1 Gallon"
        assert parsed.upc == "41250000001"  # leading zeros stripped
        assert parsed.quantity == Decimal("1")
        assert parsed.unit_price == Decimal("3.99")
        assert parsed.extended_price == Decimal("3.99")
        assert parsed.category_raw == "DAIRY"

    def test_alternate_field_names(self):
        """The alternate field names are accepted for every attribute."""
        payload = {
            "name": "Eggs Large 12 ct",
            "upcCode": "012345",
            "qty": 2,
            "price": "4.50",
            "totalPrice": "9.00",
            "department": "EGGS",
        }
        parsed = parse_meijer_item(payload)

        assert parsed.product_name_raw == "Eggs Large 12 ct"
        assert parsed.upc == "12345"
        assert parsed.quantity == Decimal("2")
        assert parsed.unit_price == Decimal("4.50")
        assert parsed.extended_price == Decimal("9.00")
        assert parsed.category_raw == "EGGS"

    def test_calculates_extended_from_unit_price(self):
        """With no extended price given, it equals unit price times quantity."""
        payload = {
            "description": "Bananas",
            "unitPrice": "0.59",
            "quantity": 3,
        }
        parsed = parse_meijer_item(payload)

        assert parsed.extended_price == Decimal("1.77")

    def test_discounts_parsed(self):
        """Regular/sale prices and coupon/loyalty amounts are all captured."""
        payload = {
            "description": "Cereal",
            "unitPrice": "4.99",
            "extendedPrice": "4.99",
            "regularPrice": "5.99",
            "salePrice": "4.99",
            "couponAmount": "1.00",
            "loyaltyAmount": "0.50",
        }
        parsed = parse_meijer_item(payload)

        assert parsed.regular_price == Decimal("5.99")
        assert parsed.sale_price == Decimal("4.99")
        assert parsed.coupon_discount == Decimal("1.00")
        assert parsed.loyalty_discount == Decimal("0.50")

    def test_alternate_discount_names(self):
        """The "couponDiscount"/"loyaltyDiscount" keys are accepted too."""
        payload = {
            "description": "Bread",
            "unitPrice": "2.99",
            "extendedPrice": "2.99",
            "couponDiscount": "0.75",
            "loyaltyDiscount": "0.25",
        }
        parsed = parse_meijer_item(payload)

        assert parsed.coupon_discount == Decimal("0.75")
        assert parsed.loyalty_discount == Decimal("0.25")

    def test_missing_fields_default_gracefully(self):
        """A description-only payload falls back to defaults everywhere else."""
        parsed = parse_meijer_item({"description": "Mystery Item"})

        assert parsed.product_name_raw == "Mystery Item"
        assert parsed.upc is None
        assert parsed.quantity == Decimal("1")
        assert parsed.unit_price == Decimal("0")
        assert parsed.regular_price is None
        assert parsed.category_raw is None

    def test_no_upc_returns_none(self):
        """A payload without any UPC key parses with upc set to None."""
        payload = {"description": "Loose Bananas", "unitPrice": "1.00", "extendedPrice": "1.00"}
        parsed = parse_meijer_item(payload)

        assert parsed.upc is None
|
||||
|
||||
|
||||
class TestNormalizeReceipt:
    """Tests for normalize_receipt across the raw receipt field variants."""

    def test_full_receipt(self):
        """A fully populated receipt maps every field and keeps the raw data."""
        user_id = str(uuid.uuid4())
        store_id = str(uuid.uuid4())
        raw = {
            "receiptId": "REC-001",
            "date": "2026-03-15",
            "total": "25.47",
            "subtotal": "23.00",
            "tax": "2.47",
            "savings": "3.00",
            "items": [
                {"description": "Milk", "unitPrice": "3.99", "extendedPrice": "3.99"},
                {"description": "Bread", "unitPrice": "2.50", "extendedPrice": "2.50"},
            ],
        }
        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.receipt_id == "REC-001"
        assert purchase.purchase_date == date(2026, 3, 15)
        assert purchase.total == Decimal("25.47")
        assert purchase.subtotal == Decimal("23.00")
        assert purchase.tax == Decimal("2.47")
        assert purchase.savings_total == Decimal("3.00")
        assert len(purchase.items) == 2
        assert purchase.items[0].product_name_raw == "Milk"
        assert purchase.raw_data == raw

    def test_alternate_receipt_fields(self):
        """The alternate receipt-level field names are accepted."""
        user_id = str(uuid.uuid4())
        store_id = str(uuid.uuid4())
        raw = {
            "receipt_id": "REC-002",
            "purchaseDate": "2026-03-14",
            "totalAmount": "10.00",
            "taxAmount": "0.75",
            "totalSavings": "1.50",
            "items": [],
        }
        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.receipt_id == "REC-002"
        assert purchase.purchase_date == date(2026, 3, 14)
        assert purchase.total == Decimal("10.00")
        assert purchase.tax == Decimal("0.75")
        assert purchase.savings_total == Decimal("1.50")

    def test_missing_date_defaults_to_today(self):
        """A receipt without a date is stamped with the current date."""
        user_id = str(uuid.uuid4())
        store_id = str(uuid.uuid4())
        raw = {"total": "5.00", "items": []}

        # Bracket the call with two readings of today's date so the test
        # cannot fail when it happens to run across midnight.
        earliest = date.today()
        purchase = normalize_receipt(raw, user_id, store_id)
        latest = date.today()

        assert earliest <= purchase.purchase_date <= latest

    def test_generates_receipt_id_if_missing(self):
        """A receipt without an id gets a non-empty generated one."""
        user_id = str(uuid.uuid4())
        store_id = str(uuid.uuid4())
        raw = {"total": "5.00", "date": "2026-03-15", "items": []}
        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.receipt_id  # Should be a generated UUID string

    def test_date_object_passthrough(self):
        """A date object supplied as the date is kept unchanged."""
        user_id = str(uuid.uuid4())
        store_id = str(uuid.uuid4())
        raw = {"date": date(2026, 1, 1), "total": "5.00", "items": []}
        purchase = normalize_receipt(raw, user_id, store_id)

        assert purchase.purchase_date == date(2026, 1, 1)
|
||||
Reference in New Issue
Block a user