Squashed 'receiptwitness/' content from commit e8d374a

git-subtree-dir: receiptwitness
git-subtree-split: e8d374a89ed8978f429598e02d31b1c5963efe22
This commit is contained in:
Coupon Carl
2026-03-28 02:24:22 +00:00
commit 342906c9d1
53 changed files with 7300 additions and 0 deletions
+29
View File
@@ -0,0 +1,29 @@
"""Shared test fixtures."""
import json
from pathlib import Path
import pytest
# Directory of JSON receipt samples, located next to this module.
FIXTURES_DIR = Path(__file__).parent.joinpath("fixtures")
@pytest.fixture
def meijer_receipt_data() -> dict:
    """Load the sample Meijer receipt fixture.

    Returns:
        The decoded contents of ``meijer_receipt.json`` under ``FIXTURES_DIR``.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale encoding.
    with open(FIXTURES_DIR / "meijer_receipt.json", encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def kroger_receipt_data() -> dict:
    """Load the sample Kroger receipt fixture.

    Returns:
        The decoded contents of ``kroger_receipt.json`` under ``FIXTURES_DIR``.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale encoding.
    with open(FIXTURES_DIR / "kroger_receipt.json", encoding="utf-8") as f:
        return json.load(f)
@pytest.fixture
def target_receipt_data() -> dict:
    """Load the sample Target receipt fixture.

    Returns:
        The decoded contents of ``target_receipt.json`` under ``FIXTURES_DIR``.
    """
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # locale encoding.
    with open(FIXTURES_DIR / "target_receipt.json", encoding="utf-8") as f:
        return json.load(f)
+131
View File
@@ -0,0 +1,131 @@
{
"orderId": "KR-2026-0312-4471",
"purchaseDate": "2026-03-12T16:45:00Z",
"storeNumber": "00357",
"divisionNumber": "014",
"total": 94.17,
"savings": 15.30,
"detail": {
"receiptId": "KR-2026-0312-4471",
"items": [
{
"description": "KROGER WHOLE MILK GAL",
"upc": "0001111041700",
"quantity": 1,
"basePrice": 3.99,
"totalPrice": 3.99,
"regularPrice": 4.29,
"salePrice": 3.99,
"couponAmount": 0.0,
"plusCardSavings": 0.30,
"department": "DAIRY"
},
{
"description": "BANANAS",
"upc": "0000000004011",
"quantity": 1,
"basePrice": 0.59,
"totalPrice": 0.59,
"regularPrice": 0.59,
"salePrice": null,
"couponAmount": null,
"plusCardSavings": null,
"department": "PRODUCE"
},
{
"description": "SIMPLE TRUTH ORG EGGS 12CT",
"upc": "0001111087840",
"quantity": 2,
"basePrice": 5.49,
"totalPrice": 10.98,
"regularPrice": 5.99,
"salePrice": 5.49,
"couponAmount": 0.0,
"plusCardSavings": 1.00,
"department": "DAIRY"
},
{
"description": "KROGER DELI TURKEY BREAST",
"upc": null,
"quantity": 0.68,
"basePrice": 9.99,
"totalPrice": 6.79,
"regularPrice": 9.99,
"salePrice": null,
"weight": 0.68,
"weightUom": "LB",
"department": "DELI"
},
{
"description": "TIDE PODS 42CT",
"upc": "0003700096223",
"quantity": 1,
"basePrice": 13.99,
"totalPrice": 13.99,
"regularPrice": 15.99,
"salePrice": 13.99,
"couponAmount": 2.00,
"plusCardSavings": 0.0,
"department": "HOUSEHOLD"
},
{
"description": "VOIDED DORITOS NACHO",
"upc": "0002840032505",
"quantity": 1,
"basePrice": 4.79,
"totalPrice": 4.79,
"voided": true,
"department": "SNACKS"
},
{
"description": "RETURNED GATORADE 8PK",
"upc": "0005200012505",
"quantity": 1,
"basePrice": 7.99,
"totalPrice": 7.99,
"status": "RETURNED",
"department": "BEVERAGES"
},
{
"description": "KROGER SHARP CHEDDAR 8OZ",
"upc": "0001111060930",
"quantity": 1,
"basePrice": 3.49,
"totalPrice": 3.49,
"regularPrice": 3.49,
"salePrice": null,
"couponAmount": null,
"plusCardSavings": null,
"department": "DAIRY"
},
{
"description": "PRIVATE SELECTION PASTA",
"upc": "0001111085612",
"quantity": 3,
"basePrice": 2.49,
"totalPrice": 7.47,
"regularPrice": 2.99,
"salePrice": 2.49,
"couponAmount": 0.0,
"plusCardSavings": 1.50,
"department": "GROCERY"
},
{
"description": "KROGER GROUND BEEF 80/20",
"upc": null,
"quantity": 1.23,
"basePrice": 5.99,
"totalPrice": 7.37,
"regularPrice": 6.99,
"salePrice": 5.99,
"weight": 1.23,
"weightUom": "LB",
"department": "MEAT"
}
],
"subtotal": 78.47,
"tax": 5.50,
"total": 94.17,
"totalSavings": 15.30
}
}
+85
View File
@@ -0,0 +1,85 @@
{
"transactionId": "TXN-2026-0310-001",
"transactionDate": "2026-03-10T14:30:00Z",
"storeNumber": "42",
"total": 87.42,
"savings": 12.50,
"detail": {
"receiptId": "TXN-2026-0310-001",
"items": [
{
"description": "ORGANIC BANANAS",
"upc": "0000000004011",
"quantity": 1,
"price": 0.69,
"extendedPrice": 0.69,
"regularPrice": 0.79,
"salePrice": 0.69,
"couponDiscount": 0.0,
"mperksDiscount": 0.10,
"category": "PRODUCE"
},
{
"description": "MEIJER 2% MILK GAL",
"upc": "0041250000123",
"quantity": 2,
"price": 3.49,
"extendedPrice": 6.98,
"regularPrice": 3.79,
"salePrice": 3.49,
"couponDiscount": 0.0,
"mperksDiscount": 0.0,
"category": "DAIRY"
},
{
"description": "CHEERIOS 18OZ",
"upc": "0016000275614",
"quantity": 1,
"price": 4.99,
"extendedPrice": 4.99,
"regularPrice": 5.49,
"salePrice": null,
"couponDiscount": 0.50,
"mperksDiscount": 0.0,
"category": "CEREAL"
},
{
"description": "WEIGHTED DELI TURKEY",
"upc": null,
"quantity": 0.75,
"price": 8.99,
"extendedPrice": 6.74,
"regularPrice": 8.99,
"salePrice": null,
"couponDiscount": null,
"mperksDiscount": null,
"category": "DELI"
},
{
"description": "VOIDED SODA 12PK",
"upc": "0004900005678",
"quantity": 1,
"price": 5.99,
"extendedPrice": 5.99,
"voided": true,
"category": "BEVERAGES"
},
{
"description": "MEIJER PAPER TOWELS 6PK",
"upc": "0041250099001",
"quantity": 1,
"price": 7.99,
"extendedPrice": 7.99,
"regularPrice": 9.99,
"salePrice": 7.99,
"couponDiscount": 1.00,
"mperksDiscount": 1.00,
"category": "HOUSEHOLD"
}
],
"subtotal": 74.92,
"tax": 5.24,
"total": 87.42,
"totalSavings": 12.50
}
}
+140
View File
@@ -0,0 +1,140 @@
{
"orderId": "TGT-2026-0315-7890",
"purchaseDate": "2026-03-15T11:23:00Z",
"storeNumber": "2774",
"total": 83.21,
"savings": 11.45,
"detail": {
"receiptId": "TGT-2026-0315-7890",
"items": [
{
"description": "GOOD & GATHER WHOLE MILK GAL",
"tcin": "14767459",
"upc": "0085239100123",
"quantity": 1,
"unitPrice": 3.89,
"totalPrice": 3.89,
"regularPrice": 4.19,
"circlePrice": 3.89,
"couponDiscount": 0.0,
"circleRewardsDiscount": 0.30,
"promoDescription": "Circle offer: Save 30c",
"department": "GROCERY"
},
{
"description": "BANANAS",
"upc": "0000000004011",
"quantity": 1,
"unitPrice": 0.25,
"totalPrice": 0.25,
"regularPrice": 0.25,
"circlePrice": null,
"couponDiscount": null,
"circleRewardsDiscount": null,
"department": "PRODUCE"
},
{
"description": "MARKET PANTRY LARGE EGGS 18CT",
"tcin": "13292174",
"upc": "0085239206753",
"quantity": 2,
"unitPrice": 4.99,
"totalPrice": 9.98,
"regularPrice": 5.49,
"circlePrice": 4.99,
"couponDiscount": 0.0,
"circleRewardsDiscount": 1.00,
"promoDescription": "Circle offer: 2 for $10",
"department": "GROCERY"
},
{
"description": "DELI SLICED TURKEY BREAST",
"upc": null,
"quantity": 0.72,
"unitPrice": 10.99,
"totalPrice": 7.91,
"regularPrice": 10.99,
"weight": 0.72,
"weightUom": "LB",
"department": "DELI"
},
{
"description": "TIDE PODS 42CT",
"tcin": "76150253",
"upc": "0003700096223",
"quantity": 1,
"unitPrice": 13.49,
"totalPrice": 13.49,
"regularPrice": 15.99,
"circlePrice": 13.49,
"couponDiscount": 2.50,
"circleRewardsDiscount": 0.0,
"promoDescription": "Circle offer + mfr coupon",
"department": "HOUSEHOLD"
},
{
"description": "UP&UP PAPER TOWELS 6PK",
"tcin": "52493117",
"upc": "0085239401567",
"quantity": 1,
"unitPrice": 8.99,
"totalPrice": 8.99,
"regularPrice": 8.99,
"circlePrice": null,
"couponDiscount": null,
"circleRewardsDiscount": null,
"department": "HOUSEHOLD"
},
{
"description": "VOIDED COCA-COLA 12PK",
"upc": "0004900002521",
"quantity": 1,
"unitPrice": 7.49,
"totalPrice": 7.49,
"voided": true,
"department": "BEVERAGES"
},
{
"description": "RETURNED OLAY MOISTURIZER",
"upc": "0007560402118",
"quantity": 1,
"unitPrice": 12.99,
"totalPrice": 12.99,
"status": "RETURNED",
"department": "BEAUTY"
},
{
"description": "FAVOURITE DAY TRAIL MIX",
"tcin": "83921045",
"dpci": "271-09-0142",
"upc": "0085239700891",
"quantity": 1,
"unitPrice": 5.49,
"totalPrice": 5.49,
"regularPrice": 5.49,
"circlePrice": null,
"couponDiscount": null,
"circleRewardsDiscount": null,
"department": "SNACKS"
},
{
"description": "BOGO GOOD & GATHER PASTA",
"tcin": "78114326",
"upc": "0085239300456",
"quantity": 2,
"unitPrice": 1.79,
"totalPrice": 1.79,
"regularPrice": 1.79,
"circlePrice": 0.895,
"couponDiscount": 0.0,
"circleRewardsDiscount": 1.79,
"promoDescription": "Buy 1 get 1 free",
"department": "GROCERY"
}
],
"subtotal": 78.32,
"tax": 4.89,
"total": 83.21,
"totalSavings": 11.45
}
}
View File
+399
View File
@@ -0,0 +1,399 @@
"""Tests for the Kroger receipt parser."""
from decimal import Decimal
from receiptwitness.parsers.kroger import _parse_item, _to_decimal, parse_kroger_receipt
from receiptwitness.scrapers.base import RawReceipt
class TestToDecimal:
    """Conversion checks for the Kroger parser's ``_to_decimal`` helper."""

    def test_from_int(self):
        """An int input compares equal to the matching Decimal."""
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        """A float input compares equal to the Decimal of its short repr."""
        converted = _to_decimal(3.99)
        assert converted == Decimal("3.99")

    def test_from_string(self):
        """A numeric string input compares equal to the matching Decimal."""
        converted = _to_decimal("7.49")
        assert converted == Decimal("7.49")

    def test_none_returns_default(self):
        """``None`` with no explicit default yields zero."""
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with an explicit default yields that default."""
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        """A non-numeric string yields zero."""
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_empty_string_returns_default(self):
        """An empty string yields zero."""
        converted = _to_decimal("")
        assert converted == Decimal("0")
class TestParseItem:
    """Field-mapping checks for the Kroger parser's ``_parse_item`` helper."""

    def test_standard_item(self):
        """A fully populated line item is mapped onto the normalized keys."""
        line = {
            "description": "KROGER WHOLE MILK GAL",
            "upc": "0001111041700",
            "quantity": 1,
            "basePrice": 3.99,
            "totalPrice": 3.99,
            "regularPrice": 4.29,
            "salePrice": 3.99,
            "couponAmount": 0.0,
            "plusCardSavings": 0.30,
            "department": "DAIRY",
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "KROGER WHOLE MILK GAL",
            "upc": "1111041700",
            "quantity": Decimal("1"),
            "unit_price": Decimal("3.99"),
            "extended_price": Decimal("3.99"),
            "regular_price": Decimal("4.29"),
            "sale_price": Decimal("3.99"),
            "loyalty_discount": Decimal("0.30"),
            "category_raw": "DAIRY",
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_weighted_item(self):
        """A by-weight item without a UPC keeps its fractional quantity."""
        line = {
            "description": "KROGER DELI TURKEY BREAST",
            "quantity": 0.68,
            "basePrice": 9.99,
            "totalPrice": 6.79,
            "weight": 0.68,
            "weightUom": "LB",
            "department": "DELI",
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "KROGER DELI TURKEY BREAST"
        assert parsed["upc"] is None
        expected = {
            "quantity": Decimal("0.68"),
            "unit_price": Decimal("9.99"),
            "extended_price": Decimal("6.79"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_missing_extended_price_computed(self):
        """Without a total, the extended price equals unit price times quantity."""
        line = {
            "description": "TEST ITEM",
            "quantity": 3,
            "basePrice": 2.49,
        }
        parsed = _parse_item(line)
        assert parsed["extended_price"] == Decimal("2.49") * Decimal("3")

    def test_item_with_coupon(self):
        """``couponAmount`` is reported as the coupon discount."""
        line = {
            "description": "TIDE PODS 42CT",
            "upc": "0003700096223",
            "quantity": 1,
            "basePrice": 13.99,
            "totalPrice": 13.99,
            "couponAmount": 2.00,
        }
        parsed = _parse_item(line)
        assert parsed["coupon_discount"] == Decimal("2.00")

    def test_missing_description_fallback(self):
        """An item with no description gets the placeholder name."""
        line = {"basePrice": 1.00, "totalPrice": 1.00}
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "UNKNOWN ITEM"

    def test_alternative_field_names_product_name(self):
        """The ``productName``-style alternative keys are recognized."""
        line = {
            "productName": "ALT NAME ITEM",
            "unitPrice": 5.00,
            "extendedAmount": 5.00,
            "qty": 1,
            "krogerProductId": "123456789",
            "category": "GROCERY",
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "ALT NAME ITEM",
            "unit_price": Decimal("5.00"),
            "extended_price": Decimal("5.00"),
            "upc": "123456789",
            "category_raw": "GROCERY",
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_item_description_field_name(self):
        """``itemDescription``, ``price`` and ``lineTotal`` are recognized."""
        line = {
            "itemDescription": "ITEM DESC FIELD",
            "price": 3.00,
            "lineTotal": 3.00,
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "ITEM DESC FIELD",
            "unit_price": Decimal("3.00"),
            "extended_price": Decimal("3.00"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_null_optional_fields(self):
        """Explicit nulls for optional price fields come back as ``None``."""
        line = {
            "description": "BANANAS",
            "upc": "0000000004011",
            "quantity": 1,
            "basePrice": 0.59,
            "totalPrice": 0.59,
            "salePrice": None,
            "couponAmount": None,
            "plusCardSavings": None,
        }
        parsed = _parse_item(line)
        for key in ("sale_price", "coupon_discount", "loyalty_discount"):
            assert parsed[key] is None

    def test_upc_leading_zeros_stripped(self):
        """Leading zeros are removed from the UPC."""
        line = {
            "description": "TEST",
            "upc": "0000000004011",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["upc"] == "4011"

    def test_upc_from_kroger_product_id(self):
        """``krogerProductId`` supplies the UPC when ``upc`` is absent."""
        line = {
            "description": "TEST",
            "krogerProductId": "987654321",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["upc"] == "987654321"

    def test_description_whitespace_stripped(self):
        """Surrounding whitespace is trimmed from the description."""
        line = {
            "description": " EXTRA SPACES ",
            "basePrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "EXTRA SPACES"

    def test_promo_price_field(self):
        """``promoPrice`` and ``originalPrice`` map to sale and regular price."""
        line = {
            "description": "PROMO ITEM",
            "promoPrice": 2.99,
            "originalPrice": 4.99,
            "basePrice": 2.99,
            "totalPrice": 2.99,
        }
        parsed = _parse_item(line)
        assert parsed["sale_price"] == Decimal("2.99")
        assert parsed["regular_price"] == Decimal("4.99")

    def test_loyalty_discount_from_fuel_points(self):
        """``fuelPointsDiscount`` is reported as the loyalty discount."""
        line = {
            "description": "FUEL DISC ITEM",
            "fuelPointsDiscount": 0.50,
            "basePrice": 3.00,
            "totalPrice": 3.00,
        }
        parsed = _parse_item(line)
        assert parsed["loyalty_discount"] == Decimal("0.50")

    def test_multi_quantity_item(self):
        """Quantity, unit price and extended price are kept for multi-buys."""
        line = {
            "description": "PRIVATE SELECTION PASTA",
            "quantity": 3,
            "basePrice": 2.49,
            "totalPrice": 7.47,
            "department": "GROCERY",
        }
        parsed = _parse_item(line)
        expected = {
            "quantity": Decimal("3"),
            "unit_price": Decimal("2.49"),
            "extended_price": Decimal("7.47"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_aisle_as_category(self):
        """``aisle`` is used as the raw category."""
        line = {
            "description": "AISLE ITEM",
            "aisle": "FROZEN FOODS",
            "basePrice": 4.00,
            "totalPrice": 4.00,
        }
        parsed = _parse_item(line)
        assert parsed["category_raw"] == "FROZEN FOODS"
class TestParseKrogerReceipt:
    """Receipt-level checks for ``parse_kroger_receipt``."""

    def test_full_receipt(self, kroger_receipt_data):
        """The sample receipt yields the expected header values and items."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12T16:45:00Z",
            store_number="00357",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)
        expected = {
            "receipt_id": "KR-2026-0312-4471",
            "purchase_date": "2026-03-12T16:45:00Z",
            "total": Decimal("94.17"),
            "subtotal": Decimal("78.47"),
            "tax": Decimal("5.50"),
            "savings_total": Decimal("15.30"),
        }
        for key, want in expected.items():
            assert parsed[key] == want
        # Should have 8 items (voided + returned items excluded)
        assert len(parsed["items"]) == 8
        # Verify first item
        first = parsed["items"][0]
        assert first["product_name_raw"] == "KROGER WHOLE MILK GAL"
        assert first["upc"] == "1111041700"

    def test_voided_items_excluded(self, kroger_receipt_data):
        """The voided sample line is absent from the parsed items."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)
        names = [entry["product_name_raw"] for entry in parsed["items"]]
        assert "VOIDED DORITOS NACHO" not in names

    def test_returned_items_excluded(self, kroger_receipt_data):
        """The returned sample line is absent from the parsed items."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)
        names = [entry["product_name_raw"] for entry in parsed["items"]]
        assert "RETURNED GATORADE 8PK" not in names

    def test_return_flag_items_excluded(self):
        """Lines flagged with ``returnFlag`` or ``isReturn`` are dropped."""
        normal_line = {
            "description": "NORMAL ITEM",
            "basePrice": 5.00,
            "totalPrice": 5.00,
        }
        flagged_line = {
            "description": "RETURNED VIA FLAG",
            "basePrice": 3.00,
            "totalPrice": 3.00,
            "returnFlag": True,
        }
        is_return_line = {
            "description": "IS RETURN ITEM",
            "basePrice": 2.00,
            "totalPrice": 2.00,
            "isReturn": True,
        }
        payload = {
            "detail": {
                "items": [normal_line, flagged_line, is_return_line],
                "total": 5.00,
            }
        }
        receipt = RawReceipt(
            receipt_id="RET-001",
            purchase_date="2026-03-12",
            raw_data=payload,
        )
        parsed = parse_kroger_receipt(receipt)
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "NORMAL ITEM"

    def test_empty_receipt(self):
        """An empty item list parses to no items and a zero total."""
        receipt = RawReceipt(
            receipt_id="EMPTY-001",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [], "total": 0}},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_receipt_with_no_detail(self):
        """A payload without ``detail`` still yields the top-level total."""
        receipt = RawReceipt(
            receipt_id="NO-DETAIL-001",
            purchase_date="2026-03-12",
            raw_data={"total": 50.00},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_raw_data_preserved(self, kroger_receipt_data):
        """The parsed result carries the very same raw payload object."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["raw_data"] is kroger_receipt_data

    def test_alternative_total_field_names(self):
        """Alternative header keys populate the total fields."""
        payload = {
            "orderTotal": 42.00,
            "subTotal": 35.00,
            "salesTax": 3.50,
            "youSaved": 5.00,
            "detail": {"items": []},
        }
        receipt = RawReceipt(
            receipt_id="ALT-001",
            purchase_date="2026-03-12",
            raw_data=payload,
        )
        parsed = parse_kroger_receipt(receipt)
        expected = {
            "total": Decimal("42.00"),
            "subtotal": Decimal("35.00"),
            "tax": Decimal("3.50"),
            "savings_total": Decimal("5.00"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_receipt_items_alternative_key(self):
        """Items listed under ``receiptItems`` are picked up."""
        only_line = {
            "description": "ALT KEY ITEM",
            "basePrice": 3.00,
            "totalPrice": 3.00,
        }
        payload = {
            "detail": {
                "receiptItems": [only_line],
                "total": 3.00,
            }
        }
        receipt = RawReceipt(
            receipt_id="ALT-KEY-001",
            purchase_date="2026-03-12",
            raw_data=payload,
        )
        parsed = parse_kroger_receipt(receipt)
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "ALT KEY ITEM"

    def test_source_url_preserved(self):
        """The source URL on the raw receipt is carried into the result."""
        url = "https://www.kroger.com/atlas/v1/receipt/api?orderId=URL-001"
        receipt = RawReceipt(
            receipt_id="URL-001",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [], "total": 0}},
            source_url=url,
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["source_url"] == url

    def test_weighted_items_in_full_receipt(self, kroger_receipt_data):
        """The by-weight turkey line keeps its quantity and prices."""
        receipt = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12",
            raw_data=kroger_receipt_data,
        )
        parsed = parse_kroger_receipt(receipt)
        # Find the weighted turkey item
        turkey = next(
            entry
            for entry in parsed["items"]
            if "TURKEY" in entry["product_name_raw"]
        )
        expected = {
            "quantity": Decimal("0.68"),
            "unit_price": Decimal("9.99"),
            "extended_price": Decimal("6.79"),
        }
        for key, want in expected.items():
            assert turkey[key] == want

    def test_grand_total_field(self):
        """``grandTotal`` is accepted as the receipt total."""
        receipt = RawReceipt(
            receipt_id="GT-001",
            purchase_date="2026-03-12",
            raw_data={"grandTotal": 99.99, "detail": {"items": []}},
        )
        parsed = parse_kroger_receipt(receipt)
        assert parsed["total"] == Decimal("99.99")
+174
View File
@@ -0,0 +1,174 @@
"""Tests for the Meijer receipt parser."""
from decimal import Decimal
from receiptwitness.parsers.meijer import _parse_item, _to_decimal, parse_meijer_receipt
from receiptwitness.scrapers.base import RawReceipt
class TestToDecimal:
    """Conversion checks for the Meijer parser's ``_to_decimal`` helper."""

    def test_from_int(self):
        """An int input compares equal to the matching Decimal."""
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        """A float input compares equal to the Decimal of its short repr."""
        converted = _to_decimal(3.49)
        assert converted == Decimal("3.49")

    def test_from_string(self):
        """A numeric string input compares equal to the matching Decimal."""
        converted = _to_decimal("7.99")
        assert converted == Decimal("7.99")

    def test_none_returns_default(self):
        """``None`` with no explicit default yields zero."""
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with an explicit default yields that default."""
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        """A non-numeric string yields zero."""
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")
class TestParseItem:
    """Field-mapping checks for the Meijer parser's ``_parse_item`` helper."""

    def test_standard_item(self):
        """A fully populated line item is mapped onto the normalized keys."""
        line = {
            "description": "ORGANIC BANANAS",
            "upc": "0000000004011",
            "quantity": 1,
            "price": 0.69,
            "extendedPrice": 0.69,
            "regularPrice": 0.79,
            "salePrice": 0.69,
            "couponDiscount": 0.0,
            "mperksDiscount": 0.10,
            "category": "PRODUCE",
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "ORGANIC BANANAS",
            "upc": "4011",
            "quantity": Decimal("1"),
            "unit_price": Decimal("0.69"),
            "extended_price": Decimal("0.69"),
            "regular_price": Decimal("0.79"),
            "sale_price": Decimal("0.69"),
            "loyalty_discount": Decimal("0.10"),
            "category_raw": "PRODUCE",
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_weighted_item(self):
        """A by-weight item without a UPC keeps its fractional quantity."""
        line = {
            "description": "WEIGHTED DELI TURKEY",
            "quantity": 0.75,
            "price": 8.99,
            "extendedPrice": 6.74,
            "category": "DELI",
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "WEIGHTED DELI TURKEY"
        assert parsed["upc"] is None
        expected = {
            "quantity": Decimal("0.75"),
            "unit_price": Decimal("8.99"),
            "extended_price": Decimal("6.74"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_missing_extended_price_computed(self):
        """Without a total, the extended price equals unit price times quantity."""
        line = {
            "description": "TEST ITEM",
            "quantity": 3,
            "price": 2.50,
        }
        parsed = _parse_item(line)
        assert parsed["extended_price"] == Decimal("2.50") * Decimal("3")

    def test_item_with_coupon_discount(self):
        """``couponDiscount`` is reported as the coupon discount."""
        line = {
            "description": "CHEERIOS 18OZ",
            "upc": "0016000275614",
            "quantity": 1,
            "price": 4.99,
            "extendedPrice": 4.99,
            "couponDiscount": 0.50,
        }
        parsed = _parse_item(line)
        assert parsed["coupon_discount"] == Decimal("0.50")

    def test_missing_description_fallback(self):
        """An item with no description gets the placeholder name."""
        line = {"price": 1.00, "extendedPrice": 1.00}
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "UNKNOWN ITEM"

    def test_alternative_field_names(self):
        """The alternative item keys are recognized."""
        line = {
            "itemDescription": "ALT NAME ITEM",
            "unitPrice": 5.00,
            "totalPrice": 5.00,
            "qty": 1,
            "UPC": "123456789",
            "departmentDescription": "GROCERY",
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "ALT NAME ITEM",
            "unit_price": Decimal("5.00"),
            "upc": "123456789",
            "category_raw": "GROCERY",
        }
        for key, want in expected.items():
            assert parsed[key] == want
class TestParseMeijerReceipt:
    """Receipt-level checks for ``parse_meijer_receipt``."""

    def test_full_receipt(self, meijer_receipt_data):
        """The sample receipt yields the expected header values and items."""
        receipt = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10T14:30:00Z",
            store_number="42",
            raw_data=meijer_receipt_data,
        )
        parsed = parse_meijer_receipt(receipt)
        expected = {
            "receipt_id": "TXN-2026-0310-001",
            "purchase_date": "2026-03-10T14:30:00Z",
            "total": Decimal("87.42"),
            "subtotal": Decimal("74.92"),
            "tax": Decimal("5.24"),
            "savings_total": Decimal("12.50"),
        }
        for key, want in expected.items():
            assert parsed[key] == want
        # Should have 5 items (voided item excluded)
        assert len(parsed["items"]) == 5
        # Verify first item
        first = parsed["items"][0]
        assert first["product_name_raw"] == "ORGANIC BANANAS"
        assert first["upc"] == "4011"

    def test_voided_items_excluded(self, meijer_receipt_data):
        """The voided sample line is absent from the parsed items."""
        receipt = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10",
            raw_data=meijer_receipt_data,
        )
        parsed = parse_meijer_receipt(receipt)
        names = [entry["product_name_raw"] for entry in parsed["items"]]
        assert "VOIDED SODA 12PK" not in names

    def test_empty_receipt(self):
        """An empty item list parses to no items and a zero total."""
        receipt = RawReceipt(
            receipt_id="EMPTY-001",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": [], "total": 0}},
        )
        parsed = parse_meijer_receipt(receipt)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_receipt_with_no_detail(self):
        """A payload without ``detail`` still yields the top-level total."""
        receipt = RawReceipt(
            receipt_id="NO-DETAIL-001",
            purchase_date="2026-03-10",
            raw_data={"total": 50.00},
        )
        parsed = parse_meijer_receipt(receipt)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_raw_data_preserved(self, meijer_receipt_data):
        """The parsed result carries the very same raw payload object."""
        receipt = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10",
            raw_data=meijer_receipt_data,
        )
        parsed = parse_meijer_receipt(receipt)
        assert parsed["raw_data"] is meijer_receipt_data
+471
View File
@@ -0,0 +1,471 @@
"""Tests for the Target receipt parser."""
from decimal import Decimal
from receiptwitness.parsers.target import _parse_item, _to_decimal, parse_target_receipt
from receiptwitness.scrapers.base import RawReceipt
class TestToDecimal:
    """Conversion checks for the Target parser's ``_to_decimal`` helper."""

    def test_from_int(self):
        """An int input compares equal to the matching Decimal."""
        converted = _to_decimal(42)
        assert converted == Decimal("42")

    def test_from_float(self):
        """A float input compares equal to the Decimal of its short repr."""
        converted = _to_decimal(3.89)
        assert converted == Decimal("3.89")

    def test_from_string(self):
        """A numeric string input compares equal to the matching Decimal."""
        converted = _to_decimal("8.99")
        assert converted == Decimal("8.99")

    def test_none_returns_default(self):
        """``None`` with no explicit default yields zero."""
        converted = _to_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with an explicit default yields that default."""
        converted = _to_decimal(None, "1")
        assert converted == Decimal("1")

    def test_invalid_string_returns_default(self):
        """A non-numeric string yields zero."""
        converted = _to_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_empty_string_returns_default(self):
        """An empty string yields zero."""
        converted = _to_decimal("")
        assert converted == Decimal("0")
class TestParseItem:
    """Field-mapping checks for the Target parser's ``_parse_item`` helper."""

    def test_standard_item(self):
        """A fully populated line item is mapped onto the normalized keys."""
        line = {
            "description": "GOOD & GATHER WHOLE MILK GAL",
            "tcin": "14767459",
            "upc": "0085239100123",
            "quantity": 1,
            "unitPrice": 3.89,
            "totalPrice": 3.89,
            "regularPrice": 4.19,
            "circlePrice": 3.89,
            "couponDiscount": 0.0,
            "circleRewardsDiscount": 0.30,
            "department": "GROCERY",
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "GOOD & GATHER WHOLE MILK GAL",
            "upc": "85239100123",
            "quantity": Decimal("1"),
            "unit_price": Decimal("3.89"),
            "extended_price": Decimal("3.89"),
            "regular_price": Decimal("4.19"),
            "sale_price": Decimal("3.89"),
            "loyalty_discount": Decimal("0.30"),
            "category_raw": "GROCERY",
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_weighted_item(self):
        """A by-weight item without a UPC keeps its fractional quantity."""
        line = {
            "description": "DELI SLICED TURKEY BREAST",
            "quantity": 0.72,
            "unitPrice": 10.99,
            "totalPrice": 7.91,
            "weight": 0.72,
            "weightUom": "LB",
            "department": "DELI",
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "DELI SLICED TURKEY BREAST"
        assert parsed["upc"] is None
        expected = {
            "quantity": Decimal("0.72"),
            "unit_price": Decimal("10.99"),
            "extended_price": Decimal("7.91"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_missing_extended_price_computed(self):
        """Without a total, the extended price equals unit price times quantity."""
        line = {
            "description": "TEST ITEM",
            "quantity": 3,
            "unitPrice": 2.49,
        }
        parsed = _parse_item(line)
        assert parsed["extended_price"] == Decimal("2.49") * Decimal("3")

    def test_item_with_coupon(self):
        """``couponDiscount`` is reported as the coupon discount."""
        line = {
            "description": "TIDE PODS 42CT",
            "upc": "0003700096223",
            "quantity": 1,
            "unitPrice": 13.49,
            "totalPrice": 13.49,
            "couponDiscount": 2.50,
        }
        parsed = _parse_item(line)
        assert parsed["coupon_discount"] == Decimal("2.50")

    def test_missing_description_fallback(self):
        """An item with no description gets the placeholder name."""
        line = {"unitPrice": 1.00, "totalPrice": 1.00}
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "UNKNOWN ITEM"

    def test_alternative_field_names(self):
        """The alternative item keys are recognized."""
        line = {
            "productName": "ALT NAME ITEM",
            "price": 5.00,
            "extendedPrice": 5.00,
            "qty": 1,
            "UPC": "123456789",
            "category": "FROZEN",
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "ALT NAME ITEM",
            "unit_price": Decimal("5.00"),
            "extended_price": Decimal("5.00"),
            "upc": "123456789",
            "category_raw": "FROZEN",
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_item_description_field_name(self):
        """``itemDescription``, ``price`` and ``lineTotal`` are recognized."""
        line = {
            "itemDescription": "ITEM DESC FIELD",
            "price": 3.00,
            "lineTotal": 3.00,
        }
        parsed = _parse_item(line)
        expected = {
            "product_name_raw": "ITEM DESC FIELD",
            "unit_price": Decimal("3.00"),
            "extended_price": Decimal("3.00"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_null_optional_fields(self):
        """Explicit nulls for optional price fields come back as ``None``."""
        line = {
            "description": "BANANAS",
            "upc": "0000000004011",
            "quantity": 1,
            "unitPrice": 0.25,
            "totalPrice": 0.25,
            "circlePrice": None,
            "couponDiscount": None,
            "circleRewardsDiscount": None,
        }
        parsed = _parse_item(line)
        for key in ("sale_price", "coupon_discount", "loyalty_discount"):
            assert parsed[key] is None

    def test_upc_leading_zeros_stripped(self):
        """Leading zeros are removed from the UPC."""
        line = {
            "description": "TEST",
            "upc": "0000000004011",
            "unitPrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["upc"] == "4011"

    def test_description_whitespace_stripped(self):
        """Surrounding whitespace is trimmed from the description."""
        line = {
            "description": " EXTRA SPACES ",
            "unitPrice": 1.00,
            "totalPrice": 1.00,
        }
        parsed = _parse_item(line)
        assert parsed["product_name_raw"] == "EXTRA SPACES"

    def test_circle_price_preferred_over_sale_price(self):
        """``circlePrice`` wins when both it and ``salePrice`` are present."""
        line = {
            "description": "CIRCLE ITEM",
            "circlePrice": 2.99,
            "salePrice": 3.49,
            "unitPrice": 2.99,
            "totalPrice": 2.99,
        }
        parsed = _parse_item(line)
        assert parsed["sale_price"] == Decimal("2.99")

    def test_sale_price_fallback_when_no_circle_price(self):
        """``salePrice`` is used when ``circlePrice`` is absent."""
        line = {
            "description": "SALE ITEM",
            "salePrice": 3.49,
            "unitPrice": 3.49,
            "totalPrice": 3.49,
        }
        parsed = _parse_item(line)
        assert parsed["sale_price"] == Decimal("3.49")

    def test_circle_rewards_discount(self):
        """``circleRewardsDiscount`` is reported as the loyalty discount."""
        line = {
            "description": "CIRCLE REWARDS ITEM",
            "circleRewardsDiscount": 1.50,
            "unitPrice": 5.00,
            "totalPrice": 5.00,
        }
        parsed = _parse_item(line)
        assert parsed["loyalty_discount"] == Decimal("1.50")

    def test_circle_discount_fallback(self):
        """``circleDiscount`` is also accepted as the loyalty discount."""
        line = {
            "description": "CIRCLE DISC ITEM",
            "circleDiscount": 0.75,
            "unitPrice": 3.00,
            "totalPrice": 3.00,
        }
        parsed = _parse_item(line)
        assert parsed["loyalty_discount"] == Decimal("0.75")

    def test_bogo_item(self):
        """A buy-one-get-one line keeps its supplied prices and discount."""
        line = {
            "description": "BOGO GOOD & GATHER PASTA",
            "upc": "0085239300456",
            "quantity": 2,
            "unitPrice": 1.79,
            "totalPrice": 1.79,
            "regularPrice": 1.79,
            "circlePrice": 0.895,
            "circleRewardsDiscount": 1.79,
            "promoDescription": "Buy 1 get 1 free",
            "department": "GROCERY",
        }
        parsed = _parse_item(line)
        expected = {
            "quantity": Decimal("2"),
            "unit_price": Decimal("1.79"),
            "extended_price": Decimal("1.79"),
            "sale_price": Decimal("0.895"),
            "loyalty_discount": Decimal("1.79"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_multi_quantity_item(self):
        """Quantity, unit price and extended price are kept for multi-buys."""
        line = {
            "description": "MARKET PANTRY EGGS",
            "quantity": 2,
            "unitPrice": 4.99,
            "totalPrice": 9.98,
            "department": "GROCERY",
        }
        parsed = _parse_item(line)
        expected = {
            "quantity": Decimal("2"),
            "unit_price": Decimal("4.99"),
            "extended_price": Decimal("9.98"),
        }
        for key, want in expected.items():
            assert parsed[key] == want

    def test_coupon_savings_field(self):
        """``couponSavings`` is reported as the coupon discount."""
        line = {
            "description": "COUPON ITEM",
            "couponSavings": 1.00,
            "unitPrice": 5.00,
            "totalPrice": 5.00,
        }
        parsed = _parse_item(line)
        assert parsed["coupon_discount"] == Decimal("1.00")
class TestParseTargetReceipt:
def test_full_receipt(self, target_receipt_data):
raw = RawReceipt(
receipt_id="TGT-2026-0315-7890",
purchase_date="2026-03-15T11:23:00Z",
store_number="2774",
raw_data=target_receipt_data,
)
result = parse_target_receipt(raw)
assert result["receipt_id"] == "TGT-2026-0315-7890"
assert result["purchase_date"] == "2026-03-15T11:23:00Z"
assert result["total"] == Decimal("83.21")
assert result["subtotal"] == Decimal("78.32")
assert result["tax"] == Decimal("4.89")
assert result["savings_total"] == Decimal("11.45")
# Should have 8 items (voided + returned items excluded)
assert len(result["items"]) == 8
# Verify first item
milk = result["items"][0]
assert milk["product_name_raw"] == "GOOD & GATHER WHOLE MILK GAL"
assert milk["upc"] == "85239100123"
def test_voided_items_excluded(self, target_receipt_data):
raw = RawReceipt(
receipt_id="TGT-2026-0315-7890",
purchase_date="2026-03-15",
raw_data=target_receipt_data,
)
result = parse_target_receipt(raw)
item_names = [i["product_name_raw"] for i in result["items"]]
assert "VOIDED COCA-COLA 12PK" not in item_names
def test_returned_items_excluded(self, target_receipt_data):
raw = RawReceipt(
receipt_id="TGT-2026-0315-7890",
purchase_date="2026-03-15",
raw_data=target_receipt_data,
)
result = parse_target_receipt(raw)
item_names = [i["product_name_raw"] for i in result["items"]]
assert "RETURNED OLAY MOISTURIZER" not in item_names
def test_return_flag_items_excluded(self):
data = {
"detail": {
"items": [
{
"description": "NORMAL ITEM",
"unitPrice": 5.00,
"totalPrice": 5.00,
},
{
"description": "RETURNED VIA FLAG",
"unitPrice": 3.00,
"totalPrice": 3.00,
"returnFlag": True,
},
{
"description": "IS RETURN ITEM",
"unitPrice": 2.00,
"totalPrice": 2.00,
"isReturn": True,
},
],
"total": 5.00,
}
}
raw = RawReceipt(
receipt_id="RET-001",
purchase_date="2026-03-15",
raw_data=data,
)
result = parse_target_receipt(raw)
assert len(result["items"]) == 1
assert result["items"][0]["product_name_raw"] == "NORMAL ITEM"
def test_cancelled_items_excluded(self):
data = {
"detail": {
"items": [
{
"description": "NORMAL ITEM",
"unitPrice": 5.00,
"totalPrice": 5.00,
},
{
"description": "CANCELLED ITEM",
"unitPrice": 3.00,
"totalPrice": 3.00,
"status": "CANCELLED",
},
],
"total": 5.00,
}
}
raw = RawReceipt(
receipt_id="CAN-001",
purchase_date="2026-03-15",
raw_data=data,
)
result = parse_target_receipt(raw)
assert len(result["items"]) == 1
assert result["items"][0]["product_name_raw"] == "NORMAL ITEM"
def test_empty_receipt(self):
raw = RawReceipt(
receipt_id="EMPTY-001",
purchase_date="2026-03-15",
raw_data={"detail": {"items": [], "total": 0}},
)
result = parse_target_receipt(raw)
assert result["items"] == []
assert result["total"] == Decimal("0")
def test_receipt_with_no_detail(self):
raw = RawReceipt(
receipt_id="NO-DETAIL-001",
purchase_date="2026-03-15",
raw_data={"total": 50.00},
)
result = parse_target_receipt(raw)
assert result["items"] == []
assert result["total"] == Decimal("50.00")
def test_raw_data_preserved(self, target_receipt_data):
    """The parsed result exposes the very same ``raw_data`` object that was passed in."""
    parsed = parse_target_receipt(
        RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
    )
    # Identity, not equality: the payload must not be copied.
    assert parsed["raw_data"] is target_receipt_data
def test_alternative_total_field_names(self):
    """``orderTotal``/``subTotal``/``salesTax``/``circleSavings`` feed the summary fields."""
    payload = {
        "orderTotal": 42.00,
        "subTotal": 35.00,
        "salesTax": 3.50,
        "circleSavings": 5.00,
        "detail": {"items": []},
    }
    parsed = parse_target_receipt(
        RawReceipt(receipt_id="ALT-001", purchase_date="2026-03-15", raw_data=payload)
    )
    expected_summary = {
        "total": Decimal("42.00"),
        "subtotal": Decimal("35.00"),
        "tax": Decimal("3.50"),
        "savings_total": Decimal("5.00"),
    }
    for summary_key, amount in expected_summary.items():
        assert parsed[summary_key] == amount
def test_receipt_items_alternative_key(self):
    """Items listed under ``detail.lineItems`` are parsed like regular items."""
    line_item = {"description": "ALT KEY ITEM", "unitPrice": 3.00, "totalPrice": 3.00}
    payload = {"detail": {"lineItems": [line_item], "total": 3.00}}
    receipt = RawReceipt(
        receipt_id="ALT-KEY-001",
        purchase_date="2026-03-15",
        raw_data=payload,
    )
    parsed_items = parse_target_receipt(receipt)["items"]
    assert len(parsed_items) == 1
    assert parsed_items[0]["product_name_raw"] == "ALT KEY ITEM"
def test_source_url_preserved(self):
    """The ``source_url`` given to ``RawReceipt`` is echoed unchanged in the parsed result."""
    order_url = "https://api.target.com/order_history/v1/orders/URL-001"
    receipt = RawReceipt(
        receipt_id="URL-001",
        purchase_date="2026-03-15",
        raw_data={"detail": {"items": [], "total": 0}},
        source_url=order_url,
    )
    parsed = parse_target_receipt(receipt)
    assert parsed["source_url"] == order_url
def test_weighted_items_in_full_receipt(self, target_receipt_data):
    """The TURKEY line keeps its fractional quantity, unit price and extended price."""
    parsed = parse_target_receipt(
        RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
    )
    # First parsed item whose raw name mentions TURKEY.
    weighted = next(entry for entry in parsed["items"] if "TURKEY" in entry["product_name_raw"])
    assert weighted["quantity"] == Decimal("0.72")
    assert weighted["unit_price"] == Decimal("10.99")
    assert weighted["extended_price"] == Decimal("7.91")
def test_bogo_items_in_full_receipt(self, target_receipt_data):
    """The BOGO line reports quantity 2 with matching extended price and loyalty discount."""
    parsed = parse_target_receipt(
        RawReceipt(
            receipt_id="TGT-2026-0315-7890",
            purchase_date="2026-03-15",
            raw_data=target_receipt_data,
        )
    )
    # First parsed item whose raw name mentions BOGO.
    bogo_line = next(entry for entry in parsed["items"] if "BOGO" in entry["product_name_raw"])
    assert bogo_line["quantity"] == Decimal("2")
    assert bogo_line["extended_price"] == Decimal("1.79")
    assert bogo_line["loyalty_discount"] == Decimal("1.79")
def test_grand_total_field(self):
    """A top-level ``grandTotal`` is used as the receipt total."""
    payload = {"grandTotal": 99.99, "detail": {"items": []}}
    receipt = RawReceipt(
        receipt_id="GT-001",
        purchase_date="2026-03-15",
        raw_data=payload,
    )
    parsed = parse_target_receipt(receipt)
    assert parsed["total"] == Decimal("99.99")
View File
+23
View File
@@ -0,0 +1,23 @@
"""Shared test fixtures for pipeline tests."""
import pytest
from cartsnitch_common.models.base import Base
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
@pytest.fixture
def engine():
    """Yield an in-memory SQLite engine with every ``Base`` table created.

    The engine is disposed once the consuming test has finished.
    """
    memory_engine = create_engine("sqlite:///:memory:")
    Base.metadata.create_all(memory_engine)
    yield memory_engine
    memory_engine.dispose()
@pytest.fixture
def session(engine):
    """Yield a SQLAlchemy session bound to the ``engine`` fixture.

    The session is used as a context manager, so it is closed after the test.
    """
    with sessionmaker(bind=engine)() as db_session:
        yield db_session
+161
View File
@@ -0,0 +1,161 @@
"""Tests for product matching & dedup pipeline."""
import uuid
from datetime import UTC, datetime
from decimal import Decimal
from cartsnitch_common.constants import MatchConfidence
from cartsnitch_common.models.product import NormalizedProduct
from cartsnitch_common.schemas.purchase import PurchaseItemCreate
from receiptwitness.pipeline.matching import (
ProductMatcher,
classify_confidence,
match_purchase_item,
)
from receiptwitness.pipeline.normalization import MatchMethod
class TestClassifyConfidence:
    """Checks how ``classify_confidence`` buckets a score for each match method."""

    def test_upc_always_high(self):
        """UPC matches are HIGH regardless of the numeric score."""
        for score in (1.0, 0.5):
            assert classify_confidence(score, MatchMethod.UPC) == MatchConfidence.HIGH

    def test_name_high(self):
        """Name scores of 0.9 and 0.8 are HIGH."""
        for score in (0.9, 0.8):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.HIGH

    def test_name_medium(self):
        """Name scores of 0.6 and 0.5 are MEDIUM."""
        for score in (0.6, 0.5):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.MEDIUM

    def test_name_low(self):
        """Name scores of 0.3 and 0.0 are LOW."""
        for score in (0.3, 0.0):
            assert classify_confidence(score, MatchMethod.NAME) == MatchConfidence.LOW
class TestProductMatcher:
    """Exercises ``ProductMatcher.match_single`` and ``ProductMatcher.match_items``."""

    @staticmethod
    def _seed(session, **columns) -> NormalizedProduct:
        """Insert and commit a ``NormalizedProduct`` with a fresh id and timestamps."""
        record = NormalizedProduct(
            id=uuid.uuid4(),
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
            **columns,
        )
        session.add(record)
        session.commit()
        return record

    def _make_item(self, name: str, upc: str | None = None) -> PurchaseItemCreate:
        """Build a purchase item priced at 3.99 with the given raw name and optional UPC."""
        price = Decimal("3.99")
        return PurchaseItemCreate(
            product_name_raw=name,
            upc=upc,
            unit_price=price,
            extended_price=Decimal("3.99"),
        )

    def test_match_by_upc(self, session):
        """An item whose UPC is among a product's variants matches that product as HIGH."""
        stored = self._seed(
            session,
            canonical_name="Whole Milk Gallon",
            upc_variants=["041250000001"],
        )

        candidate = self._make_item("Kroger Milk", upc="041250000001")
        matched, match_result, level = ProductMatcher(session).match_single(candidate)

        assert matched is not None
        assert matched.id == stored.id
        assert match_result is not None
        assert match_result.method == MatchMethod.UPC
        assert level == MatchConfidence.HIGH

    def test_match_by_name(self, session):
        """With no UPC and a 0.3 name threshold, a similar name matches via NAME."""
        self._seed(session, canonical_name="Whole Milk Gallon")

        name_matcher = ProductMatcher(session, name_threshold=0.3)
        matched, match_result, _level = name_matcher.match_single(
            self._make_item("Whole Milk Gallon Size")
        )

        assert matched is not None
        assert match_result is not None
        assert match_result.method == MatchMethod.NAME

    def test_auto_create_when_no_match(self, session):
        """With ``auto_create=True`` an unmatched item yields a new LOW-confidence product."""
        creating_matcher = ProductMatcher(session, auto_create=True)
        created, match_result, level = creating_matcher.match_single(
            self._make_item("Unique Product XYZ 16 oz")
        )

        assert created is not None
        # No existing product matched, so there is no match result to report.
        assert match_result is None
        assert level == MatchConfidence.LOW
        assert created.canonical_name == "Unique Product XYZ 16 oz"
        assert created.size == "16"
        assert created.size_unit == "oz"

    def test_no_create_when_disabled(self, session):
        """With ``auto_create=False`` an unmatched item yields neither product nor result."""
        strict_matcher = ProductMatcher(session, auto_create=False)
        matched, match_result, _level = strict_matcher.match_single(
            self._make_item("Nonexistent Product")
        )

        assert matched is None
        assert match_result is None

    def test_batch_match(self, session):
        """``match_items`` returns one outcome per item: a UPC hit, then a new product."""
        self._seed(session, canonical_name="Large Eggs 12 Count", upc_variants=["012345"])

        batch = [
            self._make_item("Large Eggs", upc="012345"),
            self._make_item("Brand New Never Seen Product"),
        ]
        outcomes = ProductMatcher(session).match_items(batch)

        assert len(outcomes) == 2
        known, fresh = outcomes[0], outcomes[1]
        assert known.match is not None
        assert known.confidence_level == MatchConfidence.HIGH
        assert known.created_new is False
        assert fresh.match is None
        assert fresh.created_new is True
class TestMatchPurchaseItem:
    """Exercises the ``match_purchase_item`` convenience wrapper."""

    def test_convenience_function(self, session):
        """An item whose UPC belongs to a stored product matches with HIGH confidence."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Ground Beef 80/20",
            upc_variants=["999888"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        candidate = PurchaseItemCreate(
            product_name_raw="Ground Beef",
            upc="999888",
            unit_price=Decimal("5.99"),
            extended_price=Decimal("5.99"),
        )
        matched, level = match_purchase_item(session, candidate)

        assert matched is not None
        assert level == MatchConfidence.HIGH

    def test_auto_create_default(self, session):
        """Called with defaults, an unknown item still yields a product, at LOW confidence."""
        candidate = PurchaseItemCreate(
            product_name_raw="Totally New Item",
            unit_price=Decimal("1.00"),
            extended_price=Decimal("1.00"),
        )
        produced, level = match_purchase_item(session, candidate)

        assert produced is not None
        assert level == MatchConfidence.LOW
+158
View File
@@ -0,0 +1,158 @@
"""Tests for product normalization module."""
import uuid
from datetime import UTC, datetime
from cartsnitch_common.models.product import NormalizedProduct
from receiptwitness.pipeline.normalization import (
MatchMethod,
clean_name,
extract_size_info,
jaccard_similarity,
match_by_name,
match_by_upc,
normalize_product,
)
class TestCleanName:
    """Checks the text normalization performed by ``clean_name``."""

    def test_lowercase(self):
        """Mixed-case input comes back entirely lowercase."""
        assert clean_name("Kroger WHOLE MILK") == "kroger whole milk"

    def test_removes_size_info(self):
        """A size token such as "16 oz" leaves no "oz" in the cleaned name."""
        assert "oz" not in clean_name("Milk 16 oz Whole")

    def test_removes_noise_words(self):
        """The words "the", "original" and "brand" are dropped as whole tokens."""
        tokens = clean_name("The Original Brand Milk").split()
        for noise_word in ("the", "original", "brand"):
            assert noise_word not in tokens, noise_word

    def test_collapses_whitespace(self):
        """Runs of spaces in the input never survive as a double space."""
        # The input must contain whitespace runs, and the check must look for a *double*
        # space: asserting that no single space exists would reject any multi-word name.
        assert "  " not in clean_name("Milk   Whole   Gallon")

    def test_removes_punctuation(self):
        """Apostrophes and opening parentheses are stripped from the cleaned name."""
        cleaned = clean_name("Meijer's Best (Organic) Milk!")
        assert "'" not in cleaned
        assert "(" not in cleaned
class TestExtractSizeInfo:
    """Checks the ``(size, unit)`` pairs that ``extract_size_info`` pulls out of names."""

    def test_extracts_oz(self):
        """"18 oz" is reported as ``("18", "oz")``."""
        assert extract_size_info("Cereal 18 oz box") == ("18", "oz")

    def test_extracts_fl_oz(self):
        """"64 fl oz" is reported with the unit spelled ``fl_oz``."""
        assert extract_size_info("Juice 64 fl oz") == ("64", "fl_oz")

    def test_extracts_lb(self):
        """A decimal weight keeps its fractional part as a string."""
        assert extract_size_info("Ground Beef 1.5 lb") == ("1.5", "lb")

    def test_extracts_ct(self):
        """"12 ct" is reported as ``("12", "ct")``."""
        assert extract_size_info("Eggs Large 12 ct") == ("12", "ct")

    def test_no_size_returns_none(self):
        """A name without any size token yields ``None``."""
        size_info = extract_size_info("Bananas")
        assert size_info is None
class TestJaccardSimilarity:
    """Checks ``jaccard_similarity`` on identical, disjoint, overlapping and empty input."""

    def test_identical_strings(self):
        """The same phrase compared with itself scores 1.0."""
        phrase = "whole milk gallon"
        assert jaccard_similarity(phrase, "whole milk gallon") == 1.0

    def test_completely_different(self):
        """Phrases sharing no word score 0.0."""
        disjoint_score = jaccard_similarity("apple juice", "ground beef")
        assert disjoint_score == 0.0

    def test_partial_overlap(self):
        """Two of three words shared ("whole", "milk") lands strictly between 0.4 and 0.8."""
        overlap_score = jaccard_similarity("kroger whole milk", "meijer whole milk")
        assert 0.4 < overlap_score < 0.8

    def test_empty_strings(self):
        """An empty string on either side scores 0.0."""
        for left, right in (("", ""), ("milk", "")):
            assert jaccard_similarity(left, right) == 0.0
class TestMatchByUPC:
    """Exercises ``match_by_upc`` against the ``session`` fixture."""

    def test_match_found(self, session):
        """Looking up the first of two stored UPC variants returns a UPC match at 1.0."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Whole Milk, Gallon",
            upc_variants=["0041250000001", "0041250000002"],
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        # NOTE(review): the previous comment here said SQLite does not support JSONB
        # containment and that this lookup "will raise" (working only on PostgreSQL),
        # yet the assertions below expect a successful match — confirm which is current.
        upc_match = match_by_upc(session, "0041250000001")
        assert upc_match is not None
        assert upc_match.method == MatchMethod.UPC
        assert upc_match.confidence == 1.0

    def test_no_match(self, session):
        """A UPC that was never stored yields ``None``."""
        assert match_by_upc(session, "9999999999999") is None
class TestMatchByName:
    """Exercises ``match_by_name`` with exact, fuzzy and unrelated product names."""

    @staticmethod
    def _store(session, canonical_name):
        """Insert and commit a ``NormalizedProduct`` carrying only a canonical name."""
        record = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name=canonical_name,
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(record)
        session.commit()
        return record

    def test_exact_name_match(self, session):
        """The same words without the comma match by NAME with confidence above 0.5."""
        self._store(session, "Whole Milk, Gallon")

        name_match = match_by_name(session, "Whole Milk Gallon")
        assert name_match is not None
        assert name_match.method == MatchMethod.NAME
        assert name_match.confidence > 0.5

    def test_fuzzy_match(self, session):
        """A differently branded but otherwise similar name matches at threshold 0.3."""
        self._store(session, "Kroger Whole Milk, 1 Gallon")

        name_match = match_by_name(session, "Meijer Whole Milk 1 Gallon", threshold=0.3)
        assert name_match is not None
        assert name_match.confidence > 0.3

    def test_no_match_below_threshold(self, session):
        """An unrelated name yields ``None`` at threshold 0.5."""
        self._store(session, "Ground Beef 80/20")

        assert match_by_name(session, "Apple Juice 64 oz", threshold=0.5) is None
class TestNormalizeProduct:
    """Exercises ``normalize_product`` when no UPC is supplied."""

    def test_name_fallback(self, session):
        """With ``upc=None`` a similar stored name is matched via the NAME method."""
        stored = NormalizedProduct(
            id=uuid.uuid4(),
            canonical_name="Large Eggs, 12 count",
            created_at=datetime.now(UTC),
            updated_at=datetime.now(UTC),
        )
        session.add(stored)
        session.commit()

        outcome = normalize_product(session, "Large Eggs 12 ct", upc=None)
        assert outcome is not None
        assert outcome.method == MatchMethod.NAME

    def test_no_match(self, session):
        """With nothing stored and no UPC, the result is ``None``."""
        outcome = normalize_product(session, "Nonexistent Product XYZ", upc=None)
        assert outcome is None
+204
View File
@@ -0,0 +1,204 @@
"""Tests for receipt normalization pipeline."""
import uuid
from datetime import date
from decimal import Decimal
from receiptwitness.pipeline.receipt import (
_clean_product_name,
_safe_decimal,
normalize_receipt,
parse_meijer_item,
)
class TestCleanProductName:
    """Checks the trimming and whitespace handling of ``_clean_product_name``."""

    def test_strips_whitespace(self):
        """Surrounding spaces are removed."""
        assert _clean_product_name(" Milk ") == "Milk"

    def test_removes_leading_punctuation(self):
        """Dashes on both ends of the name are removed."""
        assert _clean_product_name("---Milk---") == "Milk"

    def test_collapses_internal_whitespace(self):
        """Runs of spaces between words collapse to a single space."""
        # The input needs real whitespace runs; with single spaces the expected value
        # equals the input and the test would pass even if nothing were collapsed.
        assert _clean_product_name("Whole   Milk    Gallon") == "Whole Milk Gallon"

    def test_empty_string(self):
        """An empty name stays empty."""
        assert _clean_product_name("") == ""
class TestSafeDecimal:
    """Checks how ``_safe_decimal`` converts strings, numbers, ``None`` and bad input."""

    def test_string_input(self):
        """A numeric string converts to the equal ``Decimal``."""
        converted = _safe_decimal("3.99")
        assert converted == Decimal("3.99")

    def test_float_input(self):
        """The float 3.99 converts to exactly ``Decimal("3.99")``."""
        converted = _safe_decimal(3.99)
        assert converted == Decimal("3.99")

    def test_int_input(self):
        """An int converts to the equal ``Decimal``."""
        converted = _safe_decimal(4)
        assert converted == Decimal("4")

    def test_none_returns_default(self):
        """``None`` with no explicit default yields zero."""
        converted = _safe_decimal(None)
        assert converted == Decimal("0")

    def test_none_custom_default(self):
        """``None`` with an explicit default yields that default."""
        fallback = Decimal("1")
        assert _safe_decimal(None, fallback) == Decimal("1")

    def test_invalid_returns_default(self):
        """Text that is not a number yields zero."""
        converted = _safe_decimal("not-a-number")
        assert converted == Decimal("0")

    def test_decimal_passthrough(self):
        """A ``Decimal`` input comes back equal to itself."""
        amount = Decimal("5.50")
        assert _safe_decimal(amount) == Decimal("5.50")
class TestParseMeijerItem:
    """Checks the field mapping performed by ``parse_meijer_item`` on raw item dicts."""

    def test_basic_item(self):
        """The primary field names map onto the parsed item's attributes."""
        parsed = parse_meijer_item(
            {
                "description": "Kroger Whole Milk 1 Gallon",
                "upc": "0041250000001",
                "quantity": 1,
                "unitPrice": "3.99",
                "extendedPrice": "3.99",
                "category": "DAIRY",
            }
        )
        assert parsed.product_name_raw == "Kroger Whole Milk 1 Gallon"
        # Leading zeros of the UPC are stripped.
        assert parsed.upc == "41250000001"
        assert parsed.quantity == Decimal("1")
        assert parsed.unit_price == Decimal("3.99")
        assert parsed.extended_price == Decimal("3.99")
        assert parsed.category_raw == "DAIRY"

    def test_alternate_field_names(self):
        """``name``/``upcCode``/``qty``/``price``/``totalPrice``/``department`` also work."""
        parsed = parse_meijer_item(
            {
                "name": "Eggs Large 12 ct",
                "upcCode": "012345",
                "qty": 2,
                "price": "4.50",
                "totalPrice": "9.00",
                "department": "EGGS",
            }
        )
        assert parsed.product_name_raw == "Eggs Large 12 ct"
        assert parsed.upc == "12345"
        assert parsed.quantity == Decimal("2")
        assert parsed.unit_price == Decimal("4.50")
        assert parsed.extended_price == Decimal("9.00")
        assert parsed.category_raw == "EGGS"

    def test_calculates_extended_from_unit_price(self):
        """Without an extended price, 0.59 at quantity 3 gives 1.77."""
        payload = {"description": "Bananas", "unitPrice": "0.59", "quantity": 3}
        assert parse_meijer_item(payload).extended_price == Decimal("1.77")

    def test_discounts_parsed(self):
        """Regular/sale prices and coupon/loyalty amounts are carried over."""
        parsed = parse_meijer_item(
            {
                "description": "Cereal",
                "unitPrice": "4.99",
                "extendedPrice": "4.99",
                "regularPrice": "5.99",
                "salePrice": "4.99",
                "couponAmount": "1.00",
                "loyaltyAmount": "0.50",
            }
        )
        assert parsed.regular_price == Decimal("5.99")
        assert parsed.sale_price == Decimal("4.99")
        assert parsed.coupon_discount == Decimal("1.00")
        assert parsed.loyalty_discount == Decimal("0.50")

    def test_alternate_discount_names(self):
        """``couponDiscount`` and ``loyaltyDiscount`` are accepted as discount fields."""
        parsed = parse_meijer_item(
            {
                "description": "Bread",
                "unitPrice": "2.99",
                "extendedPrice": "2.99",
                "couponDiscount": "0.75",
                "loyaltyDiscount": "0.25",
            }
        )
        assert parsed.coupon_discount == Decimal("0.75")
        assert parsed.loyalty_discount == Decimal("0.25")

    def test_missing_fields_default_gracefully(self):
        """A description-only item gets quantity 1, price 0 and ``None`` for the rest."""
        parsed = parse_meijer_item({"description": "Mystery Item"})
        assert parsed.product_name_raw == "Mystery Item"
        assert parsed.upc is None
        assert parsed.quantity == Decimal("1")
        assert parsed.unit_price == Decimal("0")
        assert parsed.regular_price is None
        assert parsed.category_raw is None

    def test_no_upc_returns_none(self):
        """An item with prices but no UPC field has ``upc`` set to ``None``."""
        payload = {"description": "Loose Bananas", "unitPrice": "1.00", "extendedPrice": "1.00"}
        assert parse_meijer_item(payload).upc is None
class TestNormalizeReceipt:
    """Checks ``normalize_receipt`` field mapping, defaults and generated values."""

    @staticmethod
    def _normalize(raw):
        """Run ``normalize_receipt`` on *raw* with freshly generated user and store ids."""
        user_id = str(uuid.uuid4())
        store_id = str(uuid.uuid4())
        return normalize_receipt(raw, user_id, store_id)

    def test_full_receipt(self):
        """The primary receipt field names map onto the purchase, items included."""
        raw = {
            "receiptId": "REC-001",
            "date": "2026-03-15",
            "total": "25.47",
            "subtotal": "23.00",
            "tax": "2.47",
            "savings": "3.00",
            "items": [
                {"description": "Milk", "unitPrice": "3.99", "extendedPrice": "3.99"},
                {"description": "Bread", "unitPrice": "2.50", "extendedPrice": "2.50"},
            ],
        }
        purchase = self._normalize(raw)

        assert purchase.receipt_id == "REC-001"
        assert purchase.purchase_date == date(2026, 3, 15)
        assert purchase.total == Decimal("25.47")
        assert purchase.subtotal == Decimal("23.00")
        assert purchase.tax == Decimal("2.47")
        assert purchase.savings_total == Decimal("3.00")
        assert len(purchase.items) == 2
        assert purchase.items[0].product_name_raw == "Milk"
        assert purchase.raw_data == raw

    def test_alternate_receipt_fields(self):
        """``receipt_id``/``purchaseDate``/``totalAmount``/``taxAmount``/``totalSavings`` work."""
        raw = {
            "receipt_id": "REC-002",
            "purchaseDate": "2026-03-14",
            "totalAmount": "10.00",
            "taxAmount": "0.75",
            "totalSavings": "1.50",
            "items": [],
        }
        purchase = self._normalize(raw)

        assert purchase.receipt_id == "REC-002"
        assert purchase.purchase_date == date(2026, 3, 14)
        assert purchase.total == Decimal("10.00")
        assert purchase.tax == Decimal("0.75")
        assert purchase.savings_total == Decimal("1.50")

    def test_missing_date_defaults_to_today(self):
        """A receipt without any date field is stamped with the current date."""
        # Read the clock on both sides of the call: comparing against one later
        # ``date.today()`` fails spuriously when the test straddles midnight.
        earliest = date.today()
        purchase = self._normalize({"total": "5.00", "items": []})
        latest = date.today()
        assert earliest <= purchase.purchase_date <= latest

    def test_generates_receipt_id_if_missing(self):
        """A receipt without an id still gets a non-empty ``receipt_id``."""
        purchase = self._normalize({"total": "5.00", "date": "2026-03-15", "items": []})
        # Only truthiness is checked; presumably a generated UUID string — TODO confirm.
        assert purchase.receipt_id

    def test_date_object_passthrough(self):
        """A ``date`` instance in the ``date`` field is accepted as the purchase date."""
        purchase = self._normalize({"date": date(2026, 1, 1), "total": "5.00", "items": []})
        assert purchase.purchase_date == date(2026, 1, 1)
View File
@@ -0,0 +1,435 @@
"""Regression tests: graceful handling of page layout changes.
Retailers frequently change their API response structures, field names,
and nesting. These tests verify that both parsers degrade gracefully when
encountering alternative or missing fields — producing valid output
instead of crashing.
"""
from decimal import Decimal
from receiptwitness.parsers.kroger import parse_kroger_receipt
from receiptwitness.parsers.meijer import parse_meijer_receipt
from receiptwitness.scrapers.base import RawReceipt
class TestKrogerFieldNameVariations:
    """Kroger changes field names between app versions and API revisions."""

    @staticmethod
    def _parse(receipt_id, raw_data):
        """Wrap *raw_data* in a ``RawReceipt`` dated 2026-03-12 and parse it."""
        return parse_kroger_receipt(
            RawReceipt(receipt_id=receipt_id, purchase_date="2026-03-12", raw_data=raw_data)
        )

    @classmethod
    def _parse_items(cls, receipt_id, items, total, items_key="items"):
        """Parse a receipt whose ``detail`` holds *items* under *items_key* plus *total*."""
        return cls._parse(receipt_id, {"detail": {items_key: items, "total": total}})

    def test_alternative_item_key_line_items(self):
        """Items listed under ``detail.lineItems`` are parsed."""
        result = self._parse_items(
            "KR-ALT-1",
            [{"description": "MILK", "basePrice": 3.99, "totalPrice": 3.99}],
            3.99,
            items_key="lineItems",
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "MILK"

    def test_alternative_item_key_receipt_items(self):
        """Items listed under ``detail.receiptItems`` are parsed."""
        result = self._parse_items(
            "KR-ALT-2",
            [{"description": "EGGS", "basePrice": 5.49, "totalPrice": 5.49}],
            5.49,
            items_key="receiptItems",
        )
        assert len(result["items"]) == 1
        assert result["items"][0]["product_name_raw"] == "EGGS"

    def test_alternative_description_fields(self):
        """Test productName, itemDescription and name fallbacks."""
        for field in ("productName", "itemDescription", "name"):
            result = self._parse_items(
                "KR-DESC",
                [{field: "TEST PRODUCT", "basePrice": 1.00, "totalPrice": 1.00}],
                1.00,
            )
            # Name the variant in the failure message so a broken fallback is identifiable.
            assert result["items"][0]["product_name_raw"] == "TEST PRODUCT", field

    def test_alternative_price_fields(self):
        """Test unitPrice and price fallbacks for basePrice."""
        via_unit_price = self._parse_items(
            "KR-PRICE-1",
            [{"description": "ITEM A", "unitPrice": 2.50, "totalPrice": 2.50}],
            2.50,
        )
        assert via_unit_price["items"][0]["unit_price"] == Decimal("2.50")

        via_price = self._parse_items(
            "KR-PRICE-2",
            [{"description": "ITEM B", "price": 4.00, "totalPrice": 4.00}],
            4.00,
        )
        assert via_price["items"][0]["unit_price"] == Decimal("4.00")

    def test_alternative_total_fields(self):
        """Test orderTotal, grandTotal fallbacks."""
        for field in ("orderTotal", "grandTotal"):
            result = self._parse("KR-TOT", {field: 42.50, "detail": {}})
            assert result["total"] == Decimal("42.50"), field

    def test_alternative_savings_fields(self):
        """Test the youSaved fallback (no other savings field name is exercised here)."""
        result = self._parse("KR-SAV-1", {"youSaved": 5.00, "detail": {}})
        assert result["savings_total"] == Decimal("5.00")

    def test_alternative_tax_field(self):
        """A top-level ``salesTax`` is used as the tax amount."""
        result = self._parse("KR-TAX", {"salesTax": 3.25, "detail": {}})
        assert result["tax"] == Decimal("3.25")

    def test_alternative_quantity_field_qty(self):
        """``qty`` is accepted as the item quantity."""
        result = self._parse_items(
            "KR-QTY",
            [{"description": "APPLES", "qty": 5, "basePrice": 1.00, "totalPrice": 5.00}],
            5.00,
        )
        assert result["items"][0]["quantity"] == Decimal("5")

    def test_alternative_upc_field_kroger_product_id(self):
        """``krogerProductId`` is accepted as the item UPC."""
        result = self._parse_items(
            "KR-UPC",
            [
                {
                    "description": "ITEM",
                    "krogerProductId": "12345678",
                    "basePrice": 1.00,
                    "totalPrice": 1.00,
                }
            ],
            1.00,
        )
        assert result["items"][0]["upc"] == "12345678"

    def test_missing_extended_price_computed(self):
        """When totalPrice is missing, extended_price = unit_price * quantity."""
        result = self._parse_items(
            "KR-CALC",
            [{"description": "EGGS", "basePrice": 5.49, "quantity": 2}],
            10.98,
        )
        assert result["items"][0]["extended_price"] == Decimal("5.49") * Decimal("2")
class TestMeijerFieldNameVariations:
    """Meijer XHR endpoints may change field names between SPA versions."""

    @staticmethod
    def _parse(receipt_id, raw_data):
        """Wrap *raw_data* in a ``RawReceipt`` dated 2026-03-10 and parse it."""
        return parse_meijer_receipt(
            RawReceipt(receipt_id=receipt_id, purchase_date="2026-03-10", raw_data=raw_data)
        )

    @classmethod
    def _first_item(cls, receipt_id, item, total):
        """Parse a one-item receipt (under ``detail.items``) and return the parsed item."""
        parsed = cls._parse(receipt_id, {"detail": {"items": [item], "total": total}})
        return parsed["items"][0]

    def test_alternative_item_key_line_items(self):
        """Items listed under ``detail.lineItems`` are parsed."""
        banana = {"description": "BANANAS", "price": 0.69, "extendedPrice": 0.69}
        parsed = self._parse("MJ-ALT-1", {"detail": {"lineItems": [banana], "total": 0.69}})
        assert len(parsed["items"]) == 1
        assert parsed["items"][0]["product_name_raw"] == "BANANAS"

    def test_alternative_description_fields(self):
        """``itemDescription`` and ``name`` are accepted as the item description."""
        for field in ("itemDescription", "name"):
            entry = self._first_item(
                "MJ-DESC", {field: "TEST ITEM", "price": 1.00, "extendedPrice": 1.00}, 1.00
            )
            assert entry["product_name_raw"] == "TEST ITEM"

    def test_alternative_price_field_unit_price(self):
        """``unitPrice`` is accepted as the unit price."""
        entry = self._first_item(
            "MJ-PRICE", {"description": "MILK", "unitPrice": 3.49, "totalPrice": 3.49}, 3.49
        )
        assert entry["unit_price"] == Decimal("3.49")

    def test_alternative_extended_price_field_total_price(self):
        """``totalPrice`` is accepted as the extended price."""
        entry = self._first_item(
            "MJ-EXT", {"description": "CEREAL", "price": 4.99, "totalPrice": 4.99}, 4.99
        )
        assert entry["extended_price"] == Decimal("4.99")

    def test_alternative_total_field_transaction_total(self):
        """A top-level ``transactionTotal`` is used as the receipt total."""
        parsed = self._parse("MJ-TOT", {"transactionTotal": 55.00, "detail": {}})
        assert parsed["total"] == Decimal("55.00")

    def test_alternative_loyalty_field(self):
        """``loyaltyDiscount`` is accepted as the loyalty discount."""
        entry = self._first_item(
            "MJ-LOY",
            {
                "description": "ITEM",
                "price": 5.00,
                "extendedPrice": 5.00,
                "loyaltyDiscount": 0.50,
            },
            5.00,
        )
        assert entry["loyalty_discount"] == Decimal("0.50")

    def test_alternative_upc_field_uppercase(self):
        """An upper-case ``UPC`` key is accepted; the expected value has no leading zeros."""
        entry = self._first_item(
            "MJ-UPC",
            {
                "description": "ITEM",
                "UPC": "0012345678",
                "price": 1.00,
                "extendedPrice": 1.00,
            },
            1.00,
        )
        assert entry["upc"] == "12345678"

    def test_alternative_category_field(self):
        """``departmentDescription`` is accepted as the raw category."""
        entry = self._first_item(
            "MJ-CAT",
            {
                "description": "ITEM",
                "price": 1.00,
                "extendedPrice": 1.00,
                "departmentDescription": "FROZEN",
            },
            1.00,
        )
        assert entry["category_raw"] == "FROZEN"

    def test_missing_extended_price_computed(self):
        """Without an extended price field, it equals unit price times quantity."""
        entry = self._first_item(
            "MJ-CALC", {"description": "MILK", "price": 3.49, "quantity": 2}, 6.98
        )
        assert entry["extended_price"] == Decimal("3.49") * Decimal("2")

    def test_missing_description_fallback(self):
        """An item with no description field is named "UNKNOWN ITEM"."""
        entry = self._first_item("MJ-NODESC", {"price": 1.00, "extendedPrice": 1.00}, 1.00)
        assert entry["product_name_raw"] == "UNKNOWN ITEM"
class TestMixedFieldVersions:
    """Test receipts that mix field naming conventions (happens during rollouts)."""

    def test_kroger_mixed_item_fields(self):
        """Some items use old names, some use new names in same receipt."""
        old_style = {"description": "OLD STYLE", "basePrice": 2.00, "totalPrice": 2.00}
        new_style = {"productName": "NEW STYLE", "unitPrice": 3.00, "extendedAmount": 3.00}
        receipt = RawReceipt(
            receipt_id="KR-MIX",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [old_style, new_style], "total": 5.00}},
        )
        parsed_items = parse_kroger_receipt(receipt)["items"]
        assert len(parsed_items) == 2
        first, second = parsed_items[0], parsed_items[1]
        assert first["product_name_raw"] == "OLD STYLE"
        assert first["unit_price"] == Decimal("2.00")
        assert second["product_name_raw"] == "NEW STYLE"
        assert second["unit_price"] == Decimal("3.00")

    def test_kroger_completely_unknown_structure_no_crash(self):
        """Receipt with unrecognized structure should return empty items."""
        unrecognized = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}
        parsed = parse_kroger_receipt(
            RawReceipt(receipt_id="KR-UNKNOWN", purchase_date="2026-03-12", raw_data=unrecognized)
        )
        assert parsed["receipt_id"] == "KR-UNKNOWN"
        assert parsed["items"] == []

    def test_meijer_completely_unknown_structure_no_crash(self):
        """The Meijer parser also returns empty items for an unrecognized structure."""
        unrecognized = {"something_unexpected": [1, 2, 3], "detail": {"foo": "bar"}}
        parsed = parse_meijer_receipt(
            RawReceipt(receipt_id="MJ-UNKNOWN", purchase_date="2026-03-10", raw_data=unrecognized)
        )
        assert parsed["receipt_id"] == "MJ-UNKNOWN"
        assert parsed["items"] == []

    def test_kroger_null_fields_no_crash(self):
        """Fields with None values should be handled gracefully."""
        null_item = {
            "description": "ITEM",
            **dict.fromkeys(("basePrice", "totalPrice", "quantity", "upc", "department")),
        }
        detail = {"items": [null_item], **dict.fromkeys(("total", "subtotal", "tax"))}
        parsed = parse_kroger_receipt(
            RawReceipt(
                receipt_id="KR-NULL",
                purchase_date="2026-03-12",
                raw_data={"detail": detail},
            )
        )
        only_item = parsed["items"][0]
        assert only_item["product_name_raw"] == "ITEM"
        assert only_item["unit_price"] == Decimal("0")

    def test_meijer_null_fields_no_crash(self):
        """The Meijer parser keeps the name and uses a zero unit price for None values."""
        null_item = {
            "description": "ITEM",
            **dict.fromkeys(("price", "extendedPrice", "quantity", "upc", "category")),
        }
        parsed = parse_meijer_receipt(
            RawReceipt(
                receipt_id="MJ-NULL",
                purchase_date="2026-03-10",
                raw_data={"detail": {"items": [null_item], "total": None}},
            )
        )
        only_item = parsed["items"][0]
        assert only_item["product_name_raw"] == "ITEM"
        assert only_item["unit_price"] == Decimal("0")
+365
View File
@@ -0,0 +1,365 @@
"""Regression tests: rate limiting and retry behavior.
Validates that scrapers enforce human-like delays between requests
and handle rate-limit/error responses gracefully without infinite retries.
"""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, patch
import pytest
from receiptwitness.scrapers.base import SessionData
from receiptwitness.scrapers.kroger import DEFAULT_USER_AGENT, KrogerScraper
from receiptwitness.scrapers.meijer import MeijerScraper
class TestHumanDelayBehavior:
    """Verify that human_delay respects configured bounds."""

    # Patch target for the sleep call made by the scraper base module.
    _SLEEP_TARGET = "receiptwitness.scrapers.base.asyncio.sleep"

    @pytest.mark.asyncio
    async def test_delay_within_bounds(self):
        """human_delay should sleep between min_ms/1000 and max_ms/1000 seconds."""
        kroger = KrogerScraper()
        with patch(self._SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            await kroger.human_delay(100, 200)
            fake_sleep.assert_called_once()
            slept_for = fake_sleep.call_args.args[0]
            assert 0.1 <= slept_for <= 0.2

    @pytest.mark.asyncio
    async def test_delay_uses_settings_defaults(self):
        """Without explicit args, should use settings.min/max_request_delay_ms."""
        meijer = MeijerScraper()
        with (
            patch("receiptwitness.scrapers.base.settings") as fake_settings,
            patch(self._SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep,
        ):
            fake_settings.min_request_delay_ms = 1000
            fake_settings.max_request_delay_ms = 5000
            await meijer.human_delay()
            fake_sleep.assert_called_once()
            slept_for = fake_sleep.call_args.args[0]
            assert 1.0 <= slept_for <= 5.0

    @pytest.mark.asyncio
    async def test_delay_is_randomized(self):
        """Multiple calls should produce different delays (probabilistic)."""
        kroger = KrogerScraper()
        observed = []
        with patch(self._SLEEP_TARGET, new_callable=AsyncMock) as fake_sleep:
            for _attempt in range(20):
                await kroger.human_delay(100, 5000)
                observed.append(fake_sleep.call_args.args[0])
        # With range 100-5000ms, 20 calls should have at least 2 distinct values.
        distinct_delays = set(observed)
        assert len(distinct_delays) >= 2
class TestKrogerRateLimiting:
    """Verify Kroger scraper calls human_delay between receipt fetches."""

    @staticmethod
    def _ok_response(payload):
        """Build a mocked response whose ``ok`` is True and whose ``json()`` gives *payload*."""
        response = AsyncMock()
        response.ok = True
        response.json = AsyncMock(return_value=payload)
        return response

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Scraper must call human_delay for each receipt detail fetch."""
        scraper = KrogerScraper()
        live_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=2),
        )

        # One order listing with three orders, followed by three empty detail payloads.
        order_listing = {
            "orders": [
                {
                    "orderId": f"KR-{i}",
                    "purchaseDate": "2026-03-10T14:00:00Z",
                    "storeNumber": "357",
                }
                for i in range(3)
            ]
        }
        listing_response = self._ok_response(order_listing)
        detail_response = self._ok_response({})

        request = AsyncMock()
        request.get = AsyncMock(
            side_effect=[listing_response, detail_response, detail_response, detail_response]
        )

        page = AsyncMock()
        page.goto = AsyncMock()
        page.request = request

        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()

        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser

        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as fake_async_playwright,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as delay_spy,
        ):
            # ``async_playwright()`` is used as an async context manager yielding playwright.
            playwright_cm = AsyncMock()
            playwright_cm.__aenter__ = AsyncMock(return_value=playwright)
            playwright_cm.__aexit__ = AsyncMock(return_value=False)
            fake_async_playwright.return_value = playwright_cm

            receipts = await scraper.scrape_receipts(live_session)

        assert len(receipts) == 3
        # Lower bound only: at least one delay per fetched receipt.
        # NOTE(review): the earlier comment also mentioned one extra delay for the initial
        # navigation, which this bound does not require — confirm the intended minimum.
        assert delay_spy.call_count >= 3
class TestMeijerRateLimiting:
    """Verify Meijer scraper calls human_delay between receipt fetches."""

    @pytest.mark.asyncio
    async def test_delay_called_between_receipts(self):
        """Scraping three transactions must invoke human_delay at least three times."""
        receipt_count = 3
        scraper = MeijerScraper()
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=4),
        )

        # The first request.get() call yields the transaction listing; every
        # later call yields an (empty) receipt detail payload.
        transactions = [
            {
                "transactionId": f"TXN-{i}",
                "transactionDate": "2026-03-10T14:00:00Z",
                "storeNumber": "42",
            }
            for i in range(receipt_count)
        ]
        listing = AsyncMock(ok=True, json=AsyncMock(return_value={"transactions": transactions}))
        detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))
        mock_request = AsyncMock(get=AsyncMock(side_effect=[listing, detail, detail, detail]))

        # Mocked Playwright object graph: playwright -> browser -> context -> page.
        page = AsyncMock(goto=AsyncMock(), request=mock_request)
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock) as mock_delay,
        ):
            entry = AsyncMock()
            entry.__aenter__ = AsyncMock(return_value=playwright)
            entry.__aexit__ = AsyncMock(return_value=False)
            mock_apw.return_value = entry
            receipts = await scraper.scrape_receipts(valid_session)

        assert len(receipts) == receipt_count
        assert mock_delay.call_count >= 3
class TestGracefulErrorRecovery:
    """Scrapers should not retry endlessly on errors."""

    @staticmethod
    def _playwright_entry(mock_request):
        """Build the object a patched ``async_playwright()`` call should return.

        The result is an async context manager mock whose ``__aenter__`` yields
        a mocked playwright -> browser -> context -> page chain in which
        ``page.request`` is *mock_request*.
        """
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request
        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()
        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser
        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
        mock_cm = AsyncMock()
        mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
        mock_cm.__aexit__ = AsyncMock(return_value=False)
        return mock_cm

    async def _scrape_kroger_with_status(self, status, status_text):
        """Run ``KrogerScraper.scrape_receipts`` with every GET answering non-OK.

        Args:
            status: HTTP status code reported by the mocked response.
            status_text: HTTP status text reported by the mocked response.

        Returns:
            ``(receipts, mock_request)`` so callers can assert on both the
            scraper output and the number of requests issued.
        """
        scraper = KrogerScraper()
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".kroger.com", "path": "/"}],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=2),
        )
        mock_api_response = AsyncMock()
        mock_api_response.ok = False
        mock_api_response.status = status
        mock_api_response.status_text = status_text
        mock_request = AsyncMock()
        mock_request.get = AsyncMock(return_value=mock_api_response)
        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._playwright_entry(mock_request)
            receipts = await scraper.scrape_receipts(valid_session)
        return receipts, mock_request

    @pytest.mark.asyncio
    async def test_kroger_api_500_returns_empty_not_retry(self):
        """500 error should return empty list, not retry."""
        receipts, mock_request = await self._scrape_kroger_with_status(
            500, "Internal Server Error"
        )
        assert receipts == []
        # Should only call the API once — no retries
        assert mock_request.get.call_count == 1

    @pytest.mark.asyncio
    async def test_kroger_429_returns_empty_not_retry(self):
        """Rate limit (429) should return empty, not retry."""
        receipts, mock_request = await self._scrape_kroger_with_status(429, "Too Many Requests")
        assert receipts == []
        assert mock_request.get.call_count == 1

    @pytest.mark.asyncio
    async def test_meijer_detail_exception_continues(self):
        """A failed (non-OK) detail fetch should not abort the remaining receipts.

        Despite the test name, no exception is raised here: the first detail
        request is simulated as an HTTP 500 response.
        """
        scraper = MeijerScraper()
        valid_session = SessionData(
            cookies=[{"name": "s", "value": "v", "domain": ".meijer.com", "path": "/"}],
            user_agent="test",
            created_at=datetime.now(UTC),
            expires_at=datetime.now(UTC) + timedelta(hours=4),
        )
        mock_api_response = AsyncMock()
        mock_api_response.ok = True
        mock_api_response.json = AsyncMock(
            return_value={
                "transactions": [
                    {
                        "transactionId": "TXN-1",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-2",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )
        # First detail call answers with a non-OK 500 response, second succeeds
        mock_detail_fail = AsyncMock()
        mock_detail_fail.ok = False
        mock_detail_fail.status = 500
        mock_detail_ok = AsyncMock()
        mock_detail_ok.ok = True
        mock_detail_ok.json = AsyncMock(return_value={"items": []})
        mock_request = AsyncMock()
        mock_request.get = AsyncMock(
            side_effect=[mock_api_response, mock_detail_fail, mock_detail_ok]
        )
        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            mock_apw.return_value = self._playwright_entry(mock_request)
            receipts = await scraper.scrape_receipts(valid_session)
        # Both receipts should be returned — the first with empty detail
        assert len(receipts) == 2
        assert receipts[0].raw_data.get("detail") == {}
        assert receipts[1].receipt_id == "TXN-2"
@@ -0,0 +1,364 @@
"""Regression tests: scraper output matches expected schema.
Validates that parsed receipts from both Kroger and Meijer conform to the
PurchaseCreate schema contract. Uses recorded fixtures to ensure outputs
remain stable across code changes.
"""
from decimal import Decimal
from receiptwitness.parsers.kroger import parse_kroger_receipt
from receiptwitness.parsers.meijer import parse_meijer_receipt
from receiptwitness.scrapers.base import RawReceipt
# Required top-level keys in a parsed receipt
# (_validate_receipt_schema asserts that each of these is present).
RECEIPT_REQUIRED_KEYS = {"receipt_id", "purchase_date", "total", "items", "raw_data"}
# Top-level keys a parsed receipt may additionally carry; any key outside the
# union of the required and optional sets is rejected by _validate_receipt_schema.
RECEIPT_OPTIONAL_KEYS = {"subtotal", "tax", "savings_total", "source_url"}
# Required keys in each parsed item
# (the key "upc" must be present, but _validate_item_schema allows its value to be None).
ITEM_REQUIRED_KEYS = {
    "product_name_raw",
    "upc",
    "quantity",
    "unit_price",
    "extended_price",
}
# Keys a parsed item may additionally carry; anything outside the union of the
# required and optional item sets is rejected by _validate_item_schema.
ITEM_OPTIONAL_KEYS = {
    "regular_price",
    "sale_price",
    "coupon_discount",
    "loyalty_discount",
    "category_raw",
}
def _validate_receipt_schema(result: dict) -> None:
    """Assert that a parsed receipt dict has exactly the expected keys and value types.

    Checks, in order: every required key is present, required values have the
    expected types, optional values (when present and not None) have the
    expected types, and no key falls outside the required/optional sets.
    """
    for key in RECEIPT_REQUIRED_KEYS:
        assert key in result, f"Missing required key: {key}"

    # Required fields and the type each one must have.
    required_types = (
        ("receipt_id", str),
        ("purchase_date", str),
        ("total", Decimal),
        ("items", list),
        ("raw_data", dict),
    )
    for field, expected_type in required_types:
        assert isinstance(result[field], expected_type)

    # Optional fields are only type-checked when present and not None.
    optional_types = (
        ("subtotal", Decimal),
        ("tax", Decimal),
        ("savings_total", Decimal),
        ("source_url", str),
    )
    for field, expected_type in optional_types:
        value = result.get(field)
        if value is not None:
            assert isinstance(value, expected_type)

    # Reject anything the schema does not know about.
    allowed = RECEIPT_REQUIRED_KEYS | RECEIPT_OPTIONAL_KEYS
    for key in result:
        assert key in allowed, f"Unexpected key in receipt: {key}"
def _validate_item_schema(item: dict) -> None:
    """Assert that a parsed item dict has exactly the expected keys and value types."""
    for key in ITEM_REQUIRED_KEYS:
        assert key in item, f"Missing required item key: {key}"

    # The raw product name must be a non-empty string.
    name = item["product_name_raw"]
    assert isinstance(name, str)
    assert len(name) > 0

    # Quantities and prices are always Decimal.
    for field in ("quantity", "unit_price", "extended_price"):
        assert isinstance(item[field], Decimal)

    # UPC can be None or str; a str UPC must not start with "0"
    # (leading zeros are expected to be stripped during parsing).
    upc = item["upc"]
    if upc is not None:
        assert isinstance(upc, str)
        assert not upc.startswith("0"), f"UPC has leading zeros: {item['upc']}"

    # Optional price/discount fields are Decimal when present and not None.
    for opt_key in ("regular_price", "sale_price", "coupon_discount", "loyalty_discount"):
        if item.get(opt_key) is None:
            continue
        assert isinstance(item[opt_key], Decimal), f"{opt_key} should be Decimal"

    category = item.get("category_raw")
    if category is not None:
        assert isinstance(category, str)

    # Reject anything the schema does not know about.
    allowed = ITEM_REQUIRED_KEYS | ITEM_OPTIONAL_KEYS
    for key in item:
        assert key in allowed, f"Unexpected key in item: {key}"
class TestKrogerSchemaValidation:
    """Schema and value regression checks for the recorded Kroger receipt fixture."""

    @staticmethod
    def _parse(raw_data, **extra):
        """Wrap *raw_data* in a RawReceipt for the fixture order and parse it."""
        raw = RawReceipt(
            receipt_id="KR-2026-0312-4471",
            purchase_date="2026-03-12T16:45:00Z",
            raw_data=raw_data,
            **extra,
        )
        return parse_kroger_receipt(raw)

    def test_full_receipt_schema(self, kroger_receipt_data):
        """The parsed receipt and every parsed item conform to the schema."""
        result = self._parse(
            kroger_receipt_data,
            store_number="00357",
            source_url="https://www.kroger.com/atlas/v1/receipt/api?orderId=KR-2026-0312-4471",
        )
        _validate_receipt_schema(result)
        for item in result["items"]:
            _validate_item_schema(item)

    def test_item_count_excludes_voided_and_returned(self, kroger_receipt_data):
        """Fixture has 10 items, 2 should be excluded (voided + returned)."""
        result = self._parse(kroger_receipt_data)
        assert len(result["items"]) == 8

    def test_totals_are_positive_decimals(self, kroger_receipt_data):
        """Total, subtotal, tax and savings are all strictly positive."""
        result = self._parse(kroger_receipt_data)
        zero = Decimal("0")
        assert result["total"] > zero
        assert result["subtotal"] > zero
        assert result["tax"] > zero
        assert result["savings_total"] > zero

    def test_receipt_id_preserved(self, kroger_receipt_data):
        """The receipt id given to RawReceipt is carried into the parsed result."""
        result = self._parse(kroger_receipt_data)
        assert result["receipt_id"] == "KR-2026-0312-4471"

    def test_known_product_prices(self, kroger_receipt_data):
        """Verify specific products produce correct price extraction."""
        result = self._parse(kroger_receipt_data)
        items_by_name = {}
        for parsed in result["items"]:
            items_by_name[parsed["product_name_raw"]] = parsed

        # Milk: $3.99, regular $4.29
        milk = items_by_name["KROGER WHOLE MILK GAL"]
        assert milk["unit_price"] == Decimal("3.99")
        assert milk["regular_price"] == Decimal("4.29")
        assert milk["sale_price"] == Decimal("3.99")

        # Eggs: qty 2, $5.49 each, total $10.98
        eggs = items_by_name["SIMPLE TRUTH ORG EGGS 12CT"]
        assert eggs["quantity"] == Decimal("2")
        assert eggs["unit_price"] == Decimal("5.49")
        assert eggs["extended_price"] == Decimal("10.98")

        # Deli turkey: weighted item, 0.68 lb
        turkey = items_by_name["KROGER DELI TURKEY BREAST"]
        assert turkey["quantity"] == Decimal("0.68")
        assert turkey["upc"] is None

    def test_multi_quantity_item_correct(self, kroger_receipt_data):
        """Pasta is qty=3, unit=$2.49, total=$7.47."""
        result = self._parse(kroger_receipt_data)
        pasta_items = [i for i in result["items"] if "PASTA" in i["product_name_raw"]]
        pasta = pasta_items[0]
        assert pasta["quantity"] == Decimal("3")
        assert pasta["unit_price"] == Decimal("2.49")
        assert pasta["extended_price"] == Decimal("7.47")

    def test_coupon_discount_captured(self, kroger_receipt_data):
        """Tide Pods has $2.00 coupon."""
        result = self._parse(kroger_receipt_data)
        tide_items = [i for i in result["items"] if "TIDE" in i["product_name_raw"]]
        tide = tide_items[0]
        assert tide["coupon_discount"] == Decimal("2.00")
class TestMeijerSchemaValidation:
    """Schema and value regression checks for the recorded Meijer receipt fixture."""

    @staticmethod
    def _parse(raw_data, **extra):
        """Wrap *raw_data* in a RawReceipt for the fixture transaction and parse it."""
        raw = RawReceipt(
            receipt_id="TXN-2026-0310-001",
            purchase_date="2026-03-10T14:30:00Z",
            raw_data=raw_data,
            **extra,
        )
        return parse_meijer_receipt(raw)

    def test_full_receipt_schema(self, meijer_receipt_data):
        """The parsed receipt and every parsed item conform to the schema."""
        result = self._parse(
            meijer_receipt_data,
            store_number="42",
            source_url="https://www.meijer.com/bin/meijer/profile/receipt?receiptId=TXN-2026-0310-001",
        )
        _validate_receipt_schema(result)
        for item in result["items"]:
            _validate_item_schema(item)

    def test_item_count_excludes_voided(self, meijer_receipt_data):
        """Fixture has 6 items, 1 should be excluded (voided soda)."""
        result = self._parse(meijer_receipt_data)
        assert len(result["items"]) == 5

    def test_totals_are_positive_decimals(self, meijer_receipt_data):
        """Total, subtotal, tax and savings are all strictly positive."""
        result = self._parse(meijer_receipt_data)
        assert result["total"] > Decimal("0")
        assert result["subtotal"] > Decimal("0")
        assert result["tax"] > Decimal("0")
        assert result["savings_total"] > Decimal("0")

    def test_receipt_id_preserved(self, meijer_receipt_data):
        """The receipt id given to RawReceipt is carried into the parsed result."""
        result = self._parse(meijer_receipt_data)
        assert result["receipt_id"] == "TXN-2026-0310-001"

    def test_known_product_prices(self, meijer_receipt_data):
        """Verify specific Meijer products produce correct price extraction."""
        result = self._parse(meijer_receipt_data)
        items_by_name = {i["product_name_raw"]: i for i in result["items"]}

        # Bananas: $0.69
        bananas = items_by_name["ORGANIC BANANAS"]
        assert bananas["unit_price"] == Decimal("0.69")
        # "mperks_discount" is not part of the item schema (see ITEM_REQUIRED_KEYS /
        # ITEM_OPTIONAL_KEYS); the discount is asserted through "loyalty_discount".
        # The previous conditional assertion here passed whenever the key was absent.
        assert "mperks_discount" not in bananas
        assert bananas["loyalty_discount"] == Decimal("0.10")

        # Milk: qty 2, $3.49 each, total $6.98
        milk = items_by_name["MEIJER 2% MILK GAL"]
        assert milk["quantity"] == Decimal("2")
        assert milk["unit_price"] == Decimal("3.49")
        assert milk["extended_price"] == Decimal("6.98")

        # Weighted deli turkey: 0.75 lb at $8.99/lb
        turkey = items_by_name["WEIGHTED DELI TURKEY"]
        assert turkey["quantity"] == Decimal("0.75")
        assert turkey["upc"] is None

    def test_mperks_discount_captured(self, meijer_receipt_data):
        """Paper towels has $1.00 mPerks discount."""
        result = self._parse(meijer_receipt_data)
        towels = [i for i in result["items"] if "PAPER TOWELS" in i["product_name_raw"]][0]
        assert towels["loyalty_discount"] == Decimal("1.00")
        assert towels["coupon_discount"] == Decimal("1.00")

    def test_cheerios_coupon_discount(self, meijer_receipt_data):
        """Cheerios has $0.50 coupon."""
        result = self._parse(meijer_receipt_data)
        cheerios = [i for i in result["items"] if "CHEERIOS" in i["product_name_raw"]][0]
        assert cheerios["coupon_discount"] == Decimal("0.50")
class TestEmptyAndEdgeCaseSchemas:
    """Regression tests for edge-case receipts that should not crash."""

    def test_kroger_empty_receipt(self):
        """An empty Kroger payload parses to a schema-valid receipt with no items."""
        empty = RawReceipt(receipt_id="KR-EMPTY", purchase_date="2026-03-12", raw_data={})
        parsed = parse_kroger_receipt(empty)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_meijer_empty_receipt(self):
        """An empty Meijer payload parses to a schema-valid receipt with no items."""
        empty = RawReceipt(receipt_id="MJ-EMPTY", purchase_date="2026-03-10", raw_data={})
        parsed = parse_meijer_receipt(empty)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("0")

    def test_kroger_receipt_no_detail(self):
        """A Kroger payload with only a total keeps the total and yields no items."""
        payload = {"total": 50.00}
        summary_only = RawReceipt(
            receipt_id="KR-NODET",
            purchase_date="2026-03-12",
            raw_data=payload,
        )
        parsed = parse_kroger_receipt(summary_only)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("50.00")

    def test_meijer_receipt_no_detail(self):
        """A Meijer payload with only a total keeps the total and yields no items."""
        payload = {"total": 30.00}
        summary_only = RawReceipt(
            receipt_id="MJ-NODET",
            purchase_date="2026-03-10",
            raw_data=payload,
        )
        parsed = parse_meijer_receipt(summary_only)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
        assert parsed["total"] == Decimal("30.00")

    def test_kroger_receipt_all_voided(self):
        """A receipt where every item is voided should have 0 items."""
        # One line per exclusion marker exercised: voided flag, VOIDED status,
        # RETURNED status, and returnFlag.
        excluded_lines = [
            {"description": "VOIDED A", "basePrice": 5.0, "voided": True},
            {"description": "VOIDED B", "basePrice": 3.0, "status": "VOIDED"},
            {"description": "RETURNED C", "basePrice": 7.0, "status": "RETURNED"},
            {"description": "RETURNED D", "basePrice": 2.0, "returnFlag": True},
        ]
        all_excluded = RawReceipt(
            receipt_id="KR-ALLVOID",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": excluded_lines, "total": 0}},
        )
        parsed = parse_kroger_receipt(all_excluded)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []

    def test_meijer_receipt_all_voided(self):
        """A Meijer receipt whose items are all voided should have 0 items."""
        voided_lines = [
            {"description": "VOIDED A", "price": 5.0, "voided": True},
            {"description": "VOIDED B", "price": 3.0, "status": "VOIDED"},
        ]
        all_voided = RawReceipt(
            receipt_id="MJ-ALLVOID",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": voided_lines, "total": 0}},
        )
        parsed = parse_meijer_receipt(all_voided)
        _validate_receipt_schema(parsed)
        assert parsed["items"] == []
View File
+58
View File
@@ -0,0 +1,58 @@
"""Tests for the base scraper class."""
from datetime import datetime
from unittest.mock import patch
import pytest
from receiptwitness.scrapers.base import BaseScraper, RawReceipt, SessionData
class ConcreteScraper(BaseScraper):
    """Concrete implementation for testing the abstract base.

    Every method returns a fixed placeholder value so that BaseScraper can be
    instantiated in tests.
    """

    async def login(self, username, password):
        """Return a fresh, cookie-less session; the credentials are ignored."""
        stub_session = SessionData(cookies=[], user_agent="test", created_at=datetime.now())
        return stub_session

    async def check_session(self, session):
        """Report every session as valid."""
        return True

    async def scrape_receipts(self, session, since=None):
        """Return no receipts."""
        return []

    def parse_receipt(self, raw):
        """Return an empty parsed receipt."""
        return {}
class TestBaseScraper:
    """Tests for BaseScraper.human_delay and the RawReceipt / SessionData containers."""

    @pytest.mark.asyncio
    async def test_human_delay_respects_bounds(self):
        """human_delay(100 ms, 200 ms) sleeps for between 0.1 and 0.2 seconds."""
        scraper = ConcreteScraper()
        with patch("receiptwitness.scrapers.base.asyncio.sleep") as mock_sleep:
            mock_sleep.return_value = None
            await scraper.human_delay(min_ms=100, max_ms=200)
        # First positional argument of the most recent sleep() call.
        slept_seconds = mock_sleep.call_args.args[0]
        assert 0.1 <= slept_seconds <= 0.2

    def test_raw_receipt_dataclass(self):
        """RawReceipt exposes the values it was constructed with."""
        payload = {"key": "value"}
        receipt = RawReceipt(
            receipt_id="test-123",
            purchase_date="2026-03-10",
            store_number="42",
            raw_data=payload,
        )
        assert receipt.receipt_id == "test-123"
        assert receipt.raw_data == {"key": "value"}

    def test_session_data_defaults(self):
        """SessionData defaults expires_at to None and extra to an empty dict."""
        session = SessionData(cookies=[], user_agent="test", created_at=datetime.now())
        assert session.expires_at is None
        assert session.extra == {}
+574
View File
@@ -0,0 +1,574 @@
"""Tests for the Kroger scraper.
These tests mock Playwright to avoid requiring real Kroger credentials
or network access. They verify the scraper's control flow, session handling,
date filtering, and error resilience.
"""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from receiptwitness.scrapers.base import RawReceipt, SessionData
from receiptwitness.scrapers.kroger import (
DEFAULT_TIMEZONE,
DEFAULT_USER_AGENT,
DEFAULT_VIEWPORT,
KROGER_BASE,
KROGER_LOGIN_PAGE,
KROGER_PURCHASE_HISTORY,
KrogerScraper,
)
@pytest.fixture
def scraper():
    """Provide a fresh KrogerScraper instance for each test."""
    kroger_scraper = KrogerScraper()
    return kroger_scraper
@pytest.fixture
def valid_session():
    """Provide a Kroger session that expires two hours from now."""
    session_cookie = {"name": "session", "value": "abc123", "domain": ".kroger.com", "path": "/"}
    session = SessionData(
        cookies=[session_cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=datetime.now(UTC),
        expires_at=datetime.now(UTC) + timedelta(hours=2),
        extra={"retailer": "kroger"},
    )
    return session
@pytest.fixture
def expired_session():
    """Provide a Kroger session created four hours ago that expired two hours ago."""
    stale_cookie = {"name": "session", "value": "expired", "domain": ".kroger.com", "path": "/"}
    session = SessionData(
        cookies=[stale_cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=datetime.now(UTC) - timedelta(hours=4),
        expires_at=datetime.now(UTC) - timedelta(hours=2),
    )
    return session
class TestKrogerScraperConstants:
    """Pin the module-level constants exported by the Kroger scraper."""

    def test_base_url(self):
        """KROGER_BASE is the public Kroger site root."""
        expected = "https://www.kroger.com"
        assert KROGER_BASE == expected

    def test_login_page(self):
        """KROGER_LOGIN_PAGE is the sign-in URL."""
        expected = "https://www.kroger.com/signin"
        assert KROGER_LOGIN_PAGE == expected

    def test_purchase_history_page(self):
        """KROGER_PURCHASE_HISTORY is the "my purchases" URL."""
        expected = "https://www.kroger.com/mypurchases"
        assert KROGER_PURCHASE_HISTORY == expected

    def test_default_user_agent_is_chrome(self):
        """The default user agent mentions both Chrome and Windows."""
        for token in ("Chrome", "Windows"):
            assert token in DEFAULT_USER_AGENT

    def test_default_viewport_hd(self):
        """The default viewport is 1920x1080."""
        expected = {"width": 1920, "height": 1080}
        assert DEFAULT_VIEWPORT == expected

    def test_default_timezone(self):
        """The default timezone is America/New_York."""
        expected = "America/New_York"
        assert DEFAULT_TIMEZONE == expected
class TestCheckSession:
    """Tests for KrogerScraper.check_session."""

    @staticmethod
    def _session_without_expiry():
        """Return a cookie-less session whose expires_at is None."""
        return SessionData(
            cookies=[],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=None,
        )

    @staticmethod
    def _playwright_with_page_url(url):
        """Build the object a patched ``async_playwright()`` call should return.

        The mocked page reports *url* as ``page.url`` and its ``goto`` resolves
        to a response whose ``ok`` is True.
        """
        page = AsyncMock()
        page.url = url
        response = MagicMock()
        response.ok = True
        page.goto = AsyncMock(return_value=response)
        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()
        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)
        entry = AsyncMock()
        entry.__aenter__ = AsyncMock(return_value=playwright)
        entry.__aexit__ = AsyncMock(return_value=False)
        return entry

    @pytest.mark.asyncio
    async def test_expired_session_returns_false(self, scraper, expired_session):
        """A session whose expires_at is in the past is reported as invalid."""
        outcome = await scraper.check_session(expired_session)
        assert outcome is False

    @pytest.mark.asyncio
    async def test_no_expiry_checks_via_browser(self, scraper):
        """With no expiry set, a page ending on the account dashboard means valid."""
        session = self._session_without_expiry()
        with patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw:
            mock_apw.return_value = self._playwright_with_page_url(
                "https://www.kroger.com/account/dashboard"
            )
            outcome = await scraper.check_session(session)
        assert outcome is True

    @pytest.mark.asyncio
    async def test_session_redirected_to_signin_returns_false(self, scraper):
        """With no expiry set, a page ending on the sign-in URL means invalid."""
        session = self._session_without_expiry()
        with patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw:
            mock_apw.return_value = self._playwright_with_page_url(
                "https://www.kroger.com/signin?redirectUrl=account"
            )
            outcome = await scraper.check_session(session)
        assert outcome is False
class TestLogin:
    """Tests for KrogerScraper.login."""

    @pytest.mark.asyncio
    async def test_login_returns_session_data(self, scraper):
        """login() returns SessionData carrying the browser context's cookies."""
        # Mock locator chain: three successive page.locator() calls hand back
        # these three mocks in order.
        email_field = AsyncMock()
        password_field = AsyncMock()
        submit_button = AsyncMock()
        page = AsyncMock()
        page.url = "https://www.kroger.com/"
        page.locator = MagicMock(side_effect=[email_field, password_field, submit_button])
        page.wait_for_url = AsyncMock()

        login_cookie = {
            "name": "kroger_session",
            "value": "test123",
            "domain": ".kroger.com",
            "path": "/",
        }
        context = AsyncMock()
        context.new_page = AsyncMock(return_value=page)
        context.cookies = AsyncMock(return_value=[login_cookie])
        context.add_cookies = AsyncMock()
        context.add_init_script = AsyncMock()
        browser = AsyncMock()
        browser.new_context = AsyncMock(return_value=context)
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        with (
            patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            entry = AsyncMock()
            entry.__aenter__ = AsyncMock(return_value=playwright)
            entry.__aexit__ = AsyncMock(return_value=False)
            mock_apw.return_value = entry
            session = await scraper.login("user@test.com", "password123")

        assert isinstance(session, SessionData)
        assert len(session.cookies) == 1
        assert session.cookies[0]["name"] == "kroger_session"
        assert session.user_agent == DEFAULT_USER_AGENT
        assert session.expires_at is not None
        assert session.extra == {"retailer": "kroger"}
class TestScrapeReceipts:
@pytest.mark.asyncio
async def test_scrape_returns_receipts(self, scraper, valid_session):
mock_api_response = AsyncMock()
mock_api_response.ok = True
mock_api_response.status = 200
mock_api_response.json = AsyncMock(
return_value={
"orders": [
{
"orderId": "KR-001",
"purchaseDate": "2026-03-10T14:00:00Z",
"storeNumber": "357",
},
{
"orderId": "KR-002",
"purchaseDate": "2026-03-11T10:00:00Z",
"storeNumber": "357",
},
]
}
)
mock_detail_response = AsyncMock()
mock_detail_response.ok = True
mock_detail_response.json = AsyncMock(return_value={"items": []})
mock_request = AsyncMock()
mock_request.get = AsyncMock(
side_effect=[mock_api_response, mock_detail_response, mock_detail_response]
)
mock_page = AsyncMock()
mock_page.goto = AsyncMock()
mock_page.request = mock_request
mock_context = AsyncMock()
mock_context.new_page = AsyncMock(return_value=mock_page)
mock_context.add_cookies = AsyncMock()
mock_context.add_init_script = AsyncMock()
mock_browser = AsyncMock()
mock_browser.new_context = AsyncMock(return_value=mock_context)
mock_context.browser = mock_browser
mock_pw = AsyncMock()
mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
with (
patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
patch.object(scraper, "human_delay", new_callable=AsyncMock),
):
mock_cm = AsyncMock()
mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
mock_cm.__aexit__ = AsyncMock(return_value=False)
mock_apw.return_value = mock_cm
receipts = await scraper.scrape_receipts(valid_session)
assert len(receipts) == 2
assert receipts[0].receipt_id == "KR-001"
assert receipts[1].receipt_id == "KR-002"
assert isinstance(receipts[0], RawReceipt)
@pytest.mark.asyncio
async def test_scrape_filters_by_date(self, scraper, valid_session):
mock_api_response = AsyncMock()
mock_api_response.ok = True
mock_api_response.json = AsyncMock(
return_value={
"orders": [
{
"orderId": "KR-OLD",
"purchaseDate": "2026-01-01T10:00:00Z",
"storeNumber": "357",
},
{
"orderId": "KR-NEW",
"purchaseDate": "2026-03-15T10:00:00Z",
"storeNumber": "357",
},
]
}
)
mock_detail_response = AsyncMock()
mock_detail_response.ok = True
mock_detail_response.json = AsyncMock(return_value={})
mock_request = AsyncMock()
mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response])
mock_page = AsyncMock()
mock_page.goto = AsyncMock()
mock_page.request = mock_request
mock_context = AsyncMock()
mock_context.new_page = AsyncMock(return_value=mock_page)
mock_context.add_cookies = AsyncMock()
mock_context.add_init_script = AsyncMock()
mock_browser = AsyncMock()
mock_browser.new_context = AsyncMock(return_value=mock_context)
mock_context.browser = mock_browser
mock_pw = AsyncMock()
mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
since = datetime(2026, 3, 1, tzinfo=UTC)
with (
patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
patch.object(scraper, "human_delay", new_callable=AsyncMock),
):
mock_cm = AsyncMock()
mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
mock_cm.__aexit__ = AsyncMock(return_value=False)
mock_apw.return_value = mock_cm
receipts = await scraper.scrape_receipts(valid_session, since=since)
assert len(receipts) == 1
assert receipts[0].receipt_id == "KR-NEW"
@pytest.mark.asyncio
async def test_scrape_handles_api_failure(self, scraper, valid_session):
mock_api_response = AsyncMock()
mock_api_response.ok = False
mock_api_response.status = 500
mock_api_response.status_text = "Internal Server Error"
mock_request = AsyncMock()
mock_request.get = AsyncMock(return_value=mock_api_response)
mock_page = AsyncMock()
mock_page.goto = AsyncMock()
mock_page.request = mock_request
mock_context = AsyncMock()
mock_context.new_page = AsyncMock(return_value=mock_page)
mock_context.add_cookies = AsyncMock()
mock_context.add_init_script = AsyncMock()
mock_browser = AsyncMock()
mock_browser.new_context = AsyncMock(return_value=mock_context)
mock_context.browser = mock_browser
mock_pw = AsyncMock()
mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
with (
patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
patch.object(scraper, "human_delay", new_callable=AsyncMock),
):
mock_cm = AsyncMock()
mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
mock_cm.__aexit__ = AsyncMock(return_value=False)
mock_apw.return_value = mock_cm
receipts = await scraper.scrape_receipts(valid_session)
assert receipts == []
@pytest.mark.asyncio
async def test_scrape_handles_unexpected_response(self, scraper, valid_session):
mock_api_response = AsyncMock()
mock_api_response.ok = True
mock_api_response.json = AsyncMock(return_value="not a dict")
mock_request = AsyncMock()
mock_request.get = AsyncMock(return_value=mock_api_response)
mock_page = AsyncMock()
mock_page.goto = AsyncMock()
mock_page.request = mock_request
mock_context = AsyncMock()
mock_context.new_page = AsyncMock(return_value=mock_page)
mock_context.add_cookies = AsyncMock()
mock_context.add_init_script = AsyncMock()
mock_browser = AsyncMock()
mock_browser.new_context = AsyncMock(return_value=mock_context)
mock_context.browser = mock_browser
mock_pw = AsyncMock()
mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
with (
patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
patch.object(scraper, "human_delay", new_callable=AsyncMock),
):
mock_cm = AsyncMock()
mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
mock_cm.__aexit__ = AsyncMock(return_value=False)
mock_apw.return_value = mock_cm
receipts = await scraper.scrape_receipts(valid_session)
assert receipts == []
@pytest.mark.asyncio
async def test_scrape_alternative_field_names(self, scraper, valid_session):
"""Kroger may use 'purchases' instead of 'orders'."""
mock_api_response = AsyncMock()
mock_api_response.ok = True
mock_api_response.json = AsyncMock(
return_value={
"purchases": [
{
"receiptId": "KR-ALT-001",
"transactionDate": "2026-03-10T14:00:00Z",
"divisionNumber": "014",
}
]
}
)
mock_detail_response = AsyncMock()
mock_detail_response.ok = True
mock_detail_response.json = AsyncMock(return_value={})
mock_request = AsyncMock()
mock_request.get = AsyncMock(side_effect=[mock_api_response, mock_detail_response])
mock_page = AsyncMock()
mock_page.goto = AsyncMock()
mock_page.request = mock_request
mock_context = AsyncMock()
mock_context.new_page = AsyncMock(return_value=mock_page)
mock_context.add_cookies = AsyncMock()
mock_context.add_init_script = AsyncMock()
mock_browser = AsyncMock()
mock_browser.new_context = AsyncMock(return_value=mock_context)
mock_context.browser = mock_browser
mock_pw = AsyncMock()
mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
with (
patch("receiptwitness.scrapers.kroger.async_playwright") as mock_apw,
patch.object(scraper, "human_delay", new_callable=AsyncMock),
):
mock_cm = AsyncMock()
mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
mock_cm.__aexit__ = AsyncMock(return_value=False)
mock_apw.return_value = mock_cm
receipts = await scraper.scrape_receipts(valid_session)
assert len(receipts) == 1
assert receipts[0].receipt_id == "KR-ALT-001"
@pytest.mark.asyncio
async def test_scrape_skips_orders_without_id(self, scraper, valid_session):
    """Only the order that carries an 'orderId' is returned as a receipt."""
    listing_payload = {
        "orders": [
            {"purchaseDate": "2026-03-10T14:00:00Z"},  # no id
            {"orderId": "KR-VALID", "purchaseDate": "2026-03-10T14:00:00Z"},
        ]
    }
    listing = AsyncMock(ok=True, json=AsyncMock(return_value=listing_payload))
    detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))

    # Exactly one detail response is queued after the listing response.
    api_request = AsyncMock(get=AsyncMock(side_effect=[listing, detail]))
    page = AsyncMock(goto=AsyncMock(), request=api_request)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    # Stand-in for what async_playwright() returns; entering it yields the fake driver.
    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright", return_value=manager),
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        receipts = await scraper.scrape_receipts(valid_session)

    assert len(receipts) == 1
    assert receipts[0].receipt_id == "KR-VALID"
@pytest.mark.asyncio
async def test_scrape_skips_orders_with_null_id(self, scraper, valid_session):
    """Guard against str(None): an order with null ids must not become receipt 'None'."""
    listing_payload = {
        "orders": [
            {"orderId": None, "receiptId": None, "purchaseDate": "2026-03-10T14:00:00Z"},
            {"orderId": "KR-REAL", "purchaseDate": "2026-03-10T14:00:00Z"},
        ]
    }
    listing = AsyncMock(ok=True, json=AsyncMock(return_value=listing_payload))
    detail = AsyncMock(ok=True, json=AsyncMock(return_value={}))

    # Exactly one detail response is queued after the listing response.
    api_request = AsyncMock(get=AsyncMock(side_effect=[listing, detail]))
    page = AsyncMock(goto=AsyncMock(), request=api_request)
    context = AsyncMock(
        new_page=AsyncMock(return_value=page),
        add_cookies=AsyncMock(),
        add_init_script=AsyncMock(),
    )
    browser = AsyncMock(new_context=AsyncMock(return_value=context))
    context.browser = browser
    playwright = AsyncMock()
    playwright.chromium.launch = AsyncMock(return_value=browser)

    # Stand-in for what async_playwright() returns; entering it yields the fake driver.
    manager = AsyncMock()
    manager.__aenter__ = AsyncMock(return_value=playwright)
    manager.__aexit__ = AsyncMock(return_value=False)

    with (
        patch("receiptwitness.scrapers.kroger.async_playwright", return_value=manager),
        patch.object(scraper, "human_delay", new_callable=AsyncMock),
    ):
        receipts = await scraper.scrape_receipts(valid_session)

    assert len(receipts) == 1
    assert receipts[0].receipt_id == "KR-REAL"
    # No receipt may carry the literal string "None" as its ID.
    assert all(receipt.receipt_id != "None" for receipt in receipts)
class TestParseReceipt:
    """Checks the dict produced by scraper.parse_receipt() for RawReceipt inputs."""

    def test_parse_receipt_delegates_to_parser(self, scraper):
        """One detail item in, one parsed item out, under the same receipt id."""
        line_item = {
            "description": "TEST ITEM",
            "basePrice": 5.00,
            "totalPrice": 5.00,
        }
        raw = RawReceipt(
            receipt_id="KR-001",
            purchase_date="2026-03-12",
            raw_data={"detail": {"items": [line_item], "total": 5.00}},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "KR-001"
        assert len(parsed["items"]) == 1

    def test_receipt_detail_failure_returns_empty(self, scraper):
        """An empty 'detail' dict parses to a receipt with no items."""
        raw = RawReceipt(
            receipt_id="KR-FAIL",
            purchase_date="2026-03-12",
            raw_data={"total": 10.00, "detail": {}},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "KR-FAIL"
        assert parsed["items"] == []
+585
View File
@@ -0,0 +1,585 @@
"""Tests for the Meijer scraper.
These tests mock Playwright to avoid requiring real Meijer credentials
or network access. They verify the scraper's control flow, session handling,
date filtering, and error resilience.
"""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, MagicMock, patch
import pytest
from receiptwitness.scrapers.base import RawReceipt, SessionData
from receiptwitness.scrapers.meijer import (
DEFAULT_TIMEZONE,
DEFAULT_USER_AGENT,
DEFAULT_VIEWPORT,
MEIJER_BASE,
MEIJER_LOGIN_PAGE,
MEIJER_MPERKS_HOME,
MEIJER_PURCHASE_HISTORY,
MeijerScraper,
)
@pytest.fixture
def scraper():
    """Provide a freshly constructed MeijerScraper for each test."""
    meijer_scraper = MeijerScraper()
    return meijer_scraper
@pytest.fixture
def valid_session():
    """Provide a SessionData created now that expires four hours from now."""
    session_cookie = {
        "name": "meijer_session",
        "value": "abc123",
        "domain": ".meijer.com",
        "path": "/",
    }
    return SessionData(
        cookies=[session_cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=datetime.now(UTC),
        expires_at=datetime.now(UTC) + timedelta(hours=4),
    )
@pytest.fixture
def expired_session():
    """Provide a SessionData created eight hours ago that expired four hours ago."""
    stale_cookie = {
        "name": "meijer_session",
        "value": "expired",
        "domain": ".meijer.com",
        "path": "/",
    }
    return SessionData(
        cookies=[stale_cookie],
        user_agent=DEFAULT_USER_AGENT,
        created_at=datetime.now(UTC) - timedelta(hours=8),
        expires_at=datetime.now(UTC) - timedelta(hours=4),
    )
class TestMeijerScraperConstants:
    """Pins the module-level constants imported from the Meijer scraper."""

    def test_base_url(self):
        """MEIJER_BASE is the site root."""
        expected = "https://www.meijer.com"
        assert MEIJER_BASE == expected

    def test_login_page(self):
        """MEIJER_LOGIN_PAGE is the shopping login URL."""
        expected = "https://www.meijer.com/shopping/login.html"
        assert MEIJER_LOGIN_PAGE == expected

    def test_mperks_home(self):
        """MEIJER_MPERKS_HOME is the mPerks landing URL."""
        expected = "https://www.meijer.com/mperks.html"
        assert MEIJER_MPERKS_HOME == expected

    def test_purchase_history_url(self):
        """MEIJER_PURCHASE_HISTORY is the purchase-history endpoint URL."""
        expected = "https://www.meijer.com/bin/meijer/profile/purchasehistory"
        assert MEIJER_PURCHASE_HISTORY == expected

    def test_default_user_agent_is_chrome(self):
        """The default user agent mentions both Chrome and Windows."""
        for token in ("Chrome", "Windows"):
            assert token in DEFAULT_USER_AGENT

    def test_default_viewport_hd(self):
        """The default viewport is 1920x1080."""
        assert DEFAULT_VIEWPORT == dict(width=1920, height=1080)

    def test_default_timezone(self):
        """The default timezone is America/Detroit."""
        expected = "America/Detroit"
        assert DEFAULT_TIMEZONE == expected
class TestCheckSession:
    """check_session() results for an expired session and for sessions with no expiry."""

    @pytest.mark.asyncio
    async def test_expired_session_returns_false(self, scraper, expired_session):
        """A session whose expires_at is in the past is reported as invalid."""
        assert (await scraper.check_session(expired_session)) is False

    @pytest.mark.asyncio
    async def test_no_expiry_checks_via_browser(self, scraper):
        """With expires_at=None and the page left on the mPerks URL, the result is True."""
        session = SessionData(
            cookies=[],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=None,
        )

        navigation = MagicMock(ok=True)
        page = AsyncMock(
            url="https://www.meijer.com/mperks.html",
            goto=AsyncMock(return_value=navigation),
        )
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        # Stand-in for what async_playwright() returns; entering it yields the fake driver.
        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)

        with patch("receiptwitness.scrapers.meijer.async_playwright", return_value=manager):
            outcome = await scraper.check_session(session)

        assert outcome is True

    @pytest.mark.asyncio
    async def test_session_redirected_to_login_returns_false(self, scraper):
        """With expires_at=None and the page ending on the login URL, the result is False."""
        session = SessionData(
            cookies=[],
            user_agent=DEFAULT_USER_AGENT,
            created_at=datetime.now(UTC),
            expires_at=None,
        )

        navigation = MagicMock(ok=True)
        page = AsyncMock(
            url="https://www.meijer.com/shopping/login.html?redirect=mperks",
            goto=AsyncMock(return_value=navigation),
        )
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        # Stand-in for what async_playwright() returns; entering it yields the fake driver.
        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)

        with patch("receiptwitness.scrapers.meijer.async_playwright", return_value=manager):
            outcome = await scraper.check_session(session)

        assert outcome is False
class TestLogin:
    """login() against a mocked Playwright stack."""

    @pytest.mark.asyncio
    async def test_login_returns_session_data(self, scraper):
        """login() returns SessionData carrying the context's cookies and an expiry."""
        # page.locator() hands these out in call order: email, password, button.
        email_field, password_field, submit_button = AsyncMock(), AsyncMock(), AsyncMock()
        page = AsyncMock(
            url="https://www.meijer.com/mperks.html",
            locator=MagicMock(side_effect=[email_field, password_field, submit_button]),
            wait_for_url=AsyncMock(),
        )

        issued_cookies = [
            {"name": "meijer_session", "value": "test456", "domain": ".meijer.com", "path": "/"}
        ]
        context = AsyncMock(
            new_page=AsyncMock(return_value=page),
            cookies=AsyncMock(return_value=issued_cookies),
            add_cookies=AsyncMock(),
            add_init_script=AsyncMock(),
        )
        browser = AsyncMock(new_context=AsyncMock(return_value=context))
        context.browser = browser
        playwright = AsyncMock()
        playwright.chromium.launch = AsyncMock(return_value=browser)

        # Stand-in for what async_playwright() returns; entering it yields the fake driver.
        manager = AsyncMock()
        manager.__aenter__ = AsyncMock(return_value=playwright)
        manager.__aexit__ = AsyncMock(return_value=False)

        with (
            patch("receiptwitness.scrapers.meijer.async_playwright", return_value=manager),
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            session = await scraper.login("user@test.com", "password123")

        assert isinstance(session, SessionData)
        assert len(session.cookies) == 1
        assert session.cookies[0]["name"] == "meijer_session"
        assert session.user_agent == DEFAULT_USER_AGENT
        assert session.expires_at is not None
        # Meijer sessions last 4 hours
        minimum_expiry = session.created_at + timedelta(hours=3)
        assert session.expires_at > minimum_expiry
class TestScrapeReceipts:
    """scrape_receipts() tests that run against a fully mocked Playwright stack.

    Every test needs the same page/context/browser/driver mock wiring, so that
    wiring lives in ``_scrape_with_mocks``; each test only describes the HTTP
    responses it wants ``page.request.get`` to hand out and what it expects back.
    """

    @staticmethod
    def _json_response(payload):
        """Return a mocked response with ok=True whose json() resolves to *payload*."""
        return AsyncMock(ok=True, json=AsyncMock(return_value=payload))

    @staticmethod
    async def _scrape_with_mocks(scraper, session, request_get, **scrape_kwargs):
        """Call scraper.scrape_receipts() with Playwright and human_delay mocked out.

        Args:
            scraper: The scraper under test (the ``scraper`` fixture).
            session: Session passed as the first argument to scrape_receipts().
            request_get: Mock installed as ``page.request.get``; each test configures
                it with the responses it wants handed out.
            **scrape_kwargs: Extra keyword arguments forwarded to scrape_receipts().

        Returns:
            Whatever scrape_receipts() returns.
        """
        mock_request = AsyncMock()
        mock_request.get = request_get
        mock_page = AsyncMock()
        mock_page.goto = AsyncMock()
        mock_page.request = mock_request
        mock_context = AsyncMock()
        mock_context.new_page = AsyncMock(return_value=mock_page)
        mock_context.add_cookies = AsyncMock()
        mock_context.add_init_script = AsyncMock()
        mock_browser = AsyncMock()
        mock_browser.new_context = AsyncMock(return_value=mock_context)
        mock_context.browser = mock_browser
        mock_pw = AsyncMock()
        mock_pw.chromium.launch = AsyncMock(return_value=mock_browser)
        with (
            patch("receiptwitness.scrapers.meijer.async_playwright") as mock_apw,
            patch.object(scraper, "human_delay", new_callable=AsyncMock),
        ):
            # async_playwright() is used as an async context manager by the scraper.
            mock_cm = AsyncMock()
            mock_cm.__aenter__ = AsyncMock(return_value=mock_pw)
            mock_cm.__aexit__ = AsyncMock(return_value=False)
            mock_apw.return_value = mock_cm
            return await scraper.scrape_receipts(session, **scrape_kwargs)

    @pytest.mark.asyncio
    async def test_scrape_returns_receipts(self, scraper, valid_session):
        """Two listed transactions produce two RawReceipt objects in listing order."""
        mock_api_response = self._json_response(
            {
                "transactions": [
                    {
                        "transactionId": "TXN-001",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-002",
                        "transactionDate": "2026-03-11T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )
        mock_api_response.status = 200
        mock_detail_response = self._json_response({"items": []})
        request_get = AsyncMock(
            side_effect=[mock_api_response, mock_detail_response, mock_detail_response]
        )

        receipts = await self._scrape_with_mocks(scraper, valid_session, request_get)

        assert len(receipts) == 2
        assert receipts[0].receipt_id == "TXN-001"
        assert receipts[1].receipt_id == "TXN-002"
        assert isinstance(receipts[0], RawReceipt)

    @pytest.mark.asyncio
    async def test_scrape_filters_by_date(self, scraper, valid_session):
        """With since=2026-03-01, only the transaction dated after it is returned."""
        mock_api_response = self._json_response(
            {
                "transactions": [
                    {
                        "transactionId": "TXN-OLD",
                        "transactionDate": "2026-01-01T10:00:00Z",
                        "storeNumber": "42",
                    },
                    {
                        "transactionId": "TXN-NEW",
                        "transactionDate": "2026-03-15T10:00:00Z",
                        "storeNumber": "42",
                    },
                ]
            }
        )
        mock_detail_response = self._json_response({})
        # Only one detail response is queued: a second detail fetch would exhaust it.
        request_get = AsyncMock(side_effect=[mock_api_response, mock_detail_response])
        since = datetime(2026, 3, 1, tzinfo=UTC)

        receipts = await self._scrape_with_mocks(
            scraper, valid_session, request_get, since=since
        )

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "TXN-NEW"

    @pytest.mark.asyncio
    async def test_scrape_handles_api_failure(self, scraper, valid_session):
        """A non-ok (HTTP 500) listing response yields an empty receipt list."""
        mock_api_response = AsyncMock()
        mock_api_response.ok = False
        mock_api_response.status = 500
        mock_api_response.status_text = "Internal Server Error"
        request_get = AsyncMock(return_value=mock_api_response)

        receipts = await self._scrape_with_mocks(scraper, valid_session, request_get)

        assert receipts == []

    @pytest.mark.asyncio
    async def test_scrape_handles_unexpected_response(self, scraper, valid_session):
        """A listing body that is not a dict yields an empty receipt list."""
        mock_api_response = self._json_response("not a dict")
        request_get = AsyncMock(return_value=mock_api_response)

        receipts = await self._scrape_with_mocks(scraper, valid_session, request_get)

        assert receipts == []

    @pytest.mark.asyncio
    async def test_scrape_alternative_field_names(self, scraper, valid_session):
        """Meijer may use 'purchaseHistory' instead of 'transactions'."""
        mock_api_response = self._json_response(
            {
                "purchaseHistory": [
                    {
                        "receiptId": "MJ-ALT-001",
                        "purchaseDate": "2026-03-10T14:00:00Z",
                        "storeId": "99",
                    }
                ]
            }
        )
        mock_detail_response = self._json_response({})
        request_get = AsyncMock(side_effect=[mock_api_response, mock_detail_response])

        receipts = await self._scrape_with_mocks(scraper, valid_session, request_get)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "MJ-ALT-001"

    @pytest.mark.asyncio
    async def test_scrape_skips_transactions_without_id(self, scraper, valid_session):
        """Only the transaction that carries a 'transactionId' becomes a receipt."""
        mock_api_response = self._json_response(
            {
                "transactions": [
                    {"transactionDate": "2026-03-10T14:00:00Z"},  # no id
                    {"transactionId": "TXN-VALID", "transactionDate": "2026-03-10T14:00:00Z"},
                ]
            }
        )
        mock_detail_response = self._json_response({})
        request_get = AsyncMock(side_effect=[mock_api_response, mock_detail_response])

        receipts = await self._scrape_with_mocks(scraper, valid_session, request_get)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "TXN-VALID"

    @pytest.mark.asyncio
    async def test_scrape_receipt_detail_failure_returns_empty_detail(self, scraper, valid_session):
        """Receipt detail API failure should not crash the scraper."""
        mock_api_response = self._json_response(
            {
                "transactions": [
                    {
                        "transactionId": "TXN-DETAIL-FAIL",
                        "transactionDate": "2026-03-10T14:00:00Z",
                        "storeNumber": "42",
                    }
                ]
            }
        )
        mock_detail_response = AsyncMock()
        mock_detail_response.ok = False
        mock_detail_response.status = 404
        request_get = AsyncMock(side_effect=[mock_api_response, mock_detail_response])

        receipts = await self._scrape_with_mocks(scraper, valid_session, request_get)

        assert len(receipts) == 1
        assert receipts[0].receipt_id == "TXN-DETAIL-FAIL"
        assert receipts[0].raw_data.get("detail") == {}
class TestParseReceipt:
    """Checks the dict produced by scraper.parse_receipt() for RawReceipt inputs."""

    def test_parse_receipt_delegates_to_parser(self, scraper):
        """One detail item in, one parsed item out, under the same receipt id."""
        line_item = {
            "description": "TEST ITEM",
            "price": 5.00,
            "extendedPrice": 5.00,
        }
        raw = RawReceipt(
            receipt_id="TXN-001",
            purchase_date="2026-03-10",
            raw_data={"detail": {"items": [line_item], "total": 5.00}},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "TXN-001"
        assert len(parsed["items"]) == 1

    def test_receipt_detail_failure_returns_empty(self, scraper):
        """An empty 'detail' dict parses to a receipt with no items."""
        raw = RawReceipt(
            receipt_id="TXN-FAIL",
            purchase_date="2026-03-10",
            raw_data={"total": 10.00, "detail": {}},
        )

        parsed = scraper.parse_receipt(raw)

        assert parsed["receipt_id"] == "TXN-FAIL"
        assert parsed["items"] == []
View File
+61
View File
@@ -0,0 +1,61 @@
"""Tests for session encryption/decryption."""
from unittest.mock import patch
import pytest
from cryptography.fernet import Fernet, InvalidToken
from receiptwitness.session.encryption import decrypt_session_data, encrypt_session_data
# Throwaway Fernet key, generated once when this module is imported and used
# as the session encryption key for every test here.
TEST_KEY = Fernet.generate_key().decode()
@pytest.fixture(autouse=True)
def _mock_encryption_key():
    """Swap the encryption module's settings for a mock exposing TEST_KEY during each test."""
    settings_target = "receiptwitness.session.encryption.settings"
    with patch(settings_target, session_encryption_key=TEST_KEY):
        yield
class TestEncryptDecrypt:
    """Round-trip and failure-mode tests for encrypt_session_data/decrypt_session_data."""

    def test_roundtrip(self):
        """Encrypting then decrypting returns the original dict; ciphertext is a str."""
        data = {
            "cookies": [{"name": "session", "value": "abc123", "domain": ".meijer.com"}],
            "user_agent": "Mozilla/5.0",
        }
        encrypted = encrypt_session_data(data)
        assert isinstance(encrypted, str)
        assert encrypted != str(data)
        decrypted = decrypt_session_data(encrypted)
        assert decrypted == data

    def test_different_data_different_ciphertext(self):
        """Two different payloads do not encrypt to the same string."""
        data1 = {"key": "value1"}
        data2 = {"key": "value2"}
        enc1 = encrypt_session_data(data1)
        enc2 = encrypt_session_data(data2)
        assert enc1 != enc2

    def test_decrypt_with_wrong_key_fails(self):
        """Decrypting under a different key raises InvalidToken."""
        data = {"cookies": []}
        encrypted = encrypt_session_data(data)
        wrong_key = Fernet.generate_key().decode()
        with patch("receiptwitness.session.encryption.settings") as mock_settings:
            mock_settings.session_encryption_key = wrong_key
            with pytest.raises(InvalidToken):
                decrypt_session_data(encrypted)

    def test_decrypt_tampered_data_fails(self):
        """Decrypting a token whose last five characters were replaced raises InvalidToken."""
        data = {"cookies": []}
        encrypted = encrypt_session_data(data)
        tampered = encrypted[:-5] + "XXXXX"
        # Expect the specific Fernet failure: a blanket Exception would also pass
        # on unrelated errors (e.g. a TypeError) and hide a broken test.
        with pytest.raises(InvalidToken):
            decrypt_session_data(tampered)

    def test_no_key_raises_error(self):
        """An empty configured key makes encrypt_session_data raise ValueError."""
        with patch("receiptwitness.session.encryption.settings") as mock_settings:
            mock_settings.session_encryption_key = ""
            with pytest.raises(ValueError, match="RW_SESSION_ENCRYPTION_KEY"):
                encrypt_session_data({"test": True})
+102
View File
@@ -0,0 +1,102 @@
"""Tests for session manager logic."""
from datetime import UTC, datetime, timedelta
from unittest.mock import AsyncMock, patch
import pytest
from cryptography.fernet import Fernet
from receiptwitness.scrapers.base import SessionData
from receiptwitness.session.manager import (
get_valid_session,
session_from_db_record,
session_to_db_value,
)
# Throwaway Fernet key, generated once when this module is imported and used
# as the session encryption key for every test here.
TEST_KEY = Fernet.generate_key().decode()
@pytest.fixture(autouse=True)
def _mock_encryption_key():
    """Swap the encryption module's settings for a mock exposing TEST_KEY during each test."""
    settings_target = "receiptwitness.session.encryption.settings"
    with patch(settings_target, session_encryption_key=TEST_KEY):
        yield
def _make_session(hours_until_expire: int = 4) -> SessionData:
    """Build a SessionData created now that expires *hours_until_expire* hours later.

    A negative value produces a session whose expiry is already in the past.
    """
    created = datetime.now(UTC)
    lifetime = timedelta(hours=hours_until_expire)
    cookie = {"name": "sid", "value": "test", "domain": ".meijer.com"}
    return SessionData(
        cookies=[cookie],
        user_agent="Mozilla/5.0",
        created_at=created,
        expires_at=created + lifetime,
    )
class TestSessionSerialization:
    """session_to_db_value / session_from_db_record conversions."""

    def test_roundtrip(self):
        """A session serialized for the DB restores with the same cookies and user agent."""
        original = _make_session()

        restored = session_from_db_record(session_to_db_value(original))

        assert restored is not None
        assert restored.cookies == original.cookies
        assert restored.user_agent == original.user_agent

    def test_none_returns_none(self):
        """A missing DB value restores to None."""
        restored = session_from_db_record(None)
        assert restored is None

    def test_invalid_encrypted_returns_none(self):
        """A value that is not valid ciphertext restores to None."""
        restored = session_from_db_record("garbage-data")
        assert restored is None
class TestGetValidSession:
    """get_valid_session() either keeps the stored session or logs in for a new one."""

    @pytest.mark.asyncio
    async def test_valid_existing_session(self):
        """A stored session that passes check_session is returned without a login."""
        stored = _make_session()
        stored_value = session_to_db_value(stored)
        scraper = AsyncMock(**{"check_session.return_value": True})

        result, was_refreshed = await get_valid_session(scraper, stored_value, "user", "pass")

        assert not was_refreshed
        assert result.cookies == stored.cookies
        scraper.login.assert_not_called()

    @pytest.mark.asyncio
    async def test_expired_session_triggers_login(self):
        """A stored session that is already expired leads to login with the credentials."""
        stored_value = session_to_db_value(_make_session(hours_until_expire=-1))
        scraper = AsyncMock(**{"login.return_value": _make_session()})

        _, was_refreshed = await get_valid_session(scraper, stored_value, "user", "pass")

        assert was_refreshed
        scraper.login.assert_called_once_with("user", "pass")

    @pytest.mark.asyncio
    async def test_no_existing_session_triggers_login(self):
        """With no stored value at all, login is called exactly once."""
        scraper = AsyncMock(**{"login.return_value": _make_session()})

        _, was_refreshed = await get_valid_session(scraper, None, "user", "pass")

        assert was_refreshed
        scraper.login.assert_called_once()

    @pytest.mark.asyncio
    async def test_failed_session_check_triggers_login(self):
        """An unexpired stored session that fails check_session leads to a single login."""
        stored_value = session_to_db_value(_make_session())
        scraper = AsyncMock(
            **{
                "check_session.return_value": False,
                "login.return_value": _make_session(),
            }
        )

        _, was_refreshed = await get_valid_session(scraper, stored_value, "user", "pass")

        assert was_refreshed
        scraper.login.assert_called_once()