fix: replace full-table in-Python UPC scan with SQL containment in normalization

- Add PostgreSQL JSONB containment (@>) query for match_by_upc
- Add SQLite LIKE fallback for test compatibility
- Update upc_variants column to JSONB with variant for cross-db support
- Add GIN index migration for upc_variants

Co-Authored-By: Paperclip <noreply@paperclip.ing>
2026-04-14 11:59:28 +00:00
parent da96ec7dc4
commit 24f0dd0e67
3 changed files with 62 additions and 12 deletions
@@ -0,0 +1,38 @@
 """Add GIN index on upc_variants and alter column to JSONB.
 Revision ID: 009_add_gin_index_upc_variants
 Revises: 008_create_domain_tables
 Create Date: 2026-04-14
 """
 import sqlalchemy as sa
 from alembic import op
 revision = "009_add_gin_index_upc_variants"
 down_revision = "008_create_domain_tables"
 branch_labels = None
 depends_on = None
 def upgrade() -> None:
    """Convert ``normalized_products.upc_variants`` to JSONB and index it.

    The column is rewritten in place with an explicit ``::jsonb`` cast, then a
    GIN index is created on it so containment (``@>``) lookups can use an index.
    """
    # Import the dialect module explicitly: ``import sqlalchemy as sa`` alone
    # does not guarantee that ``sa.dialects.postgresql`` has been loaded, so
    # attribute access on it only works if something else imported it first.
    from sqlalchemy.dialects import postgresql

    op.alter_column(
        "normalized_products",
        "upc_variants",
        type_=postgresql.JSONB(),
        postgresql_using="upc_variants::jsonb",
    )
    op.create_index(
        "ix_normalized_products_upc_variants_gin",
        "normalized_products",
        ["upc_variants"],
        postgresql_using="gin",
    )
 def downgrade() -> None:
    """Drop the GIN index and convert ``upc_variants`` back to plain JSON.

    The index is removed first, then the column type is reverted.
    """
    op.drop_index("ix_normalized_products_upc_variants_gin", table_name="normalized_products")
    op.alter_column(
        "normalized_products",
        "upc_variants",
        type_=sa.JSON(),
        # Spell out the conversion, mirroring upgrade(), instead of relying on
        # PostgreSQL picking a jsonb -> json cast automatically.
        postgresql_using="upc_variants::json",
    )
@@ -3,6 +3,7 @@
 from typing import TYPE_CHECKING
 from sqlalchemy import JSON, String
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.orm import Mapped, mapped_column, relationship
 from cartsnitch_common.constants import ProductCategory, SizeUnit
@@ -26,7 +27,9 @@ class NormalizedProduct(UUIDPrimaryKeyMixin, TimestampMixin, Base):
    brand: Mapped[str | None] = mapped_column(String(200))
    size: Mapped[str | None] = mapped_column(String(50))
    size_unit: Mapped[SizeUnit | None] = mapped_column(String(10))
-    upc_variants: Mapped[list[str] | None] = mapped_column(JSON, default=list)
+    upc_variants: Mapped[list[str] | None] = mapped_column(
        JSON().with_variant(JSONB(), "postgresql"), default=list
    )
    # Relationships
    purchase_items: Mapped[list["PurchaseItem"]] = relationship(back_populates="normalized_product")
@@ -5,12 +5,14 @@ Matches products across retailers by:
 2. Fuzzy name matching via token-based Jaccard similarity (lower confidence)
 """
 import json
 import re
 from dataclasses import dataclass
 from enum import StrEnum
 from cartsnitch_common.models.product import NormalizedProduct
-from sqlalchemy import select
+from sqlalchemy import cast, func, select, String
 from sqlalchemy.dialects.postgresql import JSONB
 from sqlalchemy.orm import Session
@@ -96,17 +98,24 @@ def jaccard_similarity(a: str, b: str) -> float:
 def match_by_upc(session: Session, upc: str) -> MatchResult | None:
    """Find a normalized product by exact UPC match.
-    Loads products with upc_variants and checks membership in Python
-    for cross-database compatibility (works on both PostgreSQL and SQLite).
+    On PostgreSQL the lookup is a JSONB containment test
+    (``upc_variants @> '["<upc>"]'``) evaluated in the database. On any other
+    dialect (e.g. SQLite in tests) it falls back to a LIKE search for the
+    JSON-quoted UPC inside the serialized column text.
+
+    Args:
+        session: Session used to run the query; its bind selects the dialect.
+        upc: UPC string to look up.
+
+    Returns:
+        A ``MatchResult`` with confidence 1.0 and method ``MatchMethod.UPC``
+        for the first matching product, or ``None`` when nothing matches.
    """
-    # TODO: Use PostgreSQL JSON containment query (@>) for production.
-    # Current approach loads all products into memory — acceptable for tests
-    # and small datasets, but will not scale.
-    stmt = select(NormalizedProduct).where(NormalizedProduct.upc_variants.is_not(None))
-    products = session.execute(stmt).scalars().all()
-    for product in products:
-        if product.upc_variants and upc in product.upc_variants:
-            return MatchResult(product=product, confidence=1.0, method=MatchMethod.UPC)
+    dialect_name = session.bind.dialect.name if session.bind else "default"
+    if dialect_name == "postgresql":
+        # Pass the Python list itself: the JSONB type serializes bound values,
+        # so a pre-dumped JSON string would be encoded a second time and
+        # compared as one JSON string instead of a one-element array.
+        stmt = select(NormalizedProduct).where(
+            cast(NormalizedProduct.upc_variants, JSONB).contains([upc])
+        )
+    else:
+        # Search for the UPC including its JSON quotes so "123" cannot match
+        # inside "0123456"; autoescape stops % and _ acting as LIKE wildcards.
+        # NOTE(review): assumes the column is serialized with the default
+        # json.dumps settings — confirm if a custom json_serializer is set.
+        stmt = select(NormalizedProduct).where(
+            NormalizedProduct.upc_variants.is_not(None),
+            cast(NormalizedProduct.upc_variants, String).contains(
+                json.dumps(upc), autoescape=True
+            ),
+        )
+    product = session.execute(stmt).scalars().first()
+    if product:
+        return MatchResult(product=product, confidence=1.0, method=MatchMethod.UPC)
    return None