fix: replace N+1 UPC query with SQL containment in normalization (#175)

fix: replace N+1 UPC query with SQL containment in normalization
This commit is contained in:
cartsnitch-cto[bot]
2026-04-15 02:00:04 +00:00
committed by GitHub
3 changed files with 62 additions and 12 deletions
@@ -0,0 +1,38 @@
"""Add GIN index on upc_variants and alter column to JSONB.
Revision ID: 009_add_gin_index_upc_variants
Revises: 008_create_domain_tables
Create Date: 2026-04-14
"""
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers (mirrors the "Revision ID" / "Revises" lines
# in the module docstring).
revision = "009_add_gin_index_upc_variants"
down_revision = "008_create_domain_tables"
# No branch labels or cross-revision dependencies are declared.
branch_labels = None
depends_on = None
def upgrade() -> None:
    """Convert ``normalized_products.upc_variants`` to JSONB and index it.

    Two steps, in order:

    1. Alter the column type to PostgreSQL ``JSONB``; the ``USING`` clause
       casts the existing values in place.
    2. Create a GIN index on the column.
    """
    # Import the dialect module explicitly: ``import sqlalchemy as sa`` does
    # not guarantee that the ``sqlalchemy.dialects.postgresql`` submodule is
    # loaded, so ``sa.dialects.postgresql`` only resolves when some other
    # module happened to import it first.
    from sqlalchemy.dialects import postgresql

    op.alter_column(
        "normalized_products",
        "upc_variants",
        type_=postgresql.JSONB(),
        postgresql_using="upc_variants::jsonb",
    )
    op.create_index(
        "ix_normalized_products_upc_variants_gin",
        "normalized_products",
        ["upc_variants"],
        postgresql_using="gin",
    )
def downgrade() -> None:
    """Revert ``upgrade``: drop the GIN index, then restore the JSON type.

    The index is dropped before the type change because it was created on
    the JSONB column by ``upgrade``.
    """
    op.drop_index(
        "ix_normalized_products_upc_variants_gin",
        table_name="normalized_products",
    )
    op.alter_column(
        "normalized_products",
        "upc_variants",
        type_=sa.JSON(),
        # Spell out the conversion explicitly, mirroring the ``::jsonb`` cast
        # used by ``upgrade``, instead of relying on an implicit cast.
        postgresql_using="upc_variants::json",
    )
@@ -3,6 +3,7 @@
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
from sqlalchemy import JSON, String from sqlalchemy import JSON, String
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Mapped, mapped_column, relationship from sqlalchemy.orm import Mapped, mapped_column, relationship
from cartsnitch_common.constants import ProductCategory, SizeUnit from cartsnitch_common.constants import ProductCategory, SizeUnit
@@ -26,7 +27,9 @@ class NormalizedProduct(UUIDPrimaryKeyMixin, TimestampMixin, Base):
brand: Mapped[str | None] = mapped_column(String(200)) brand: Mapped[str | None] = mapped_column(String(200))
size: Mapped[str | None] = mapped_column(String(50)) size: Mapped[str | None] = mapped_column(String(50))
size_unit: Mapped[SizeUnit | None] = mapped_column(String(10)) size_unit: Mapped[SizeUnit | None] = mapped_column(String(10))
upc_variants: Mapped[list[str] | None] = mapped_column(JSON, default=list) upc_variants: Mapped[list[str] | None] = mapped_column(
JSON().with_variant(JSONB(), "postgresql"), default=list
)
# Relationships # Relationships
purchase_items: Mapped[list["PurchaseItem"]] = relationship(back_populates="normalized_product") purchase_items: Mapped[list["PurchaseItem"]] = relationship(back_populates="normalized_product")
@@ -5,12 +5,14 @@ Matches products across retailers by:
2. Fuzzy name matching via token-based Jaccard similarity (lower confidence) 2. Fuzzy name matching via token-based Jaccard similarity (lower confidence)
""" """
import json
import re import re
from dataclasses import dataclass from dataclasses import dataclass
from enum import StrEnum from enum import StrEnum
from cartsnitch_common.models.product import NormalizedProduct from cartsnitch_common.models.product import NormalizedProduct
from sqlalchemy import select from sqlalchemy import cast, func, select, String
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
@@ -96,17 +98,24 @@ def jaccard_similarity(a: str, b: str) -> float:
def match_by_upc(session: Session, upc: str) -> MatchResult | None:
    """Find a normalized product whose ``upc_variants`` contains *upc* exactly.

    On PostgreSQL the lookup is a single JSONB containment query (``@>``).
    On any other dialect (e.g. SQLite in tests) it falls back to a ``LIKE``
    search over the column cast to text.

    Args:
        session: Active SQLAlchemy session; its bind decides which query
            form is used.
        upc: The UPC string to look up.

    Returns:
        A ``MatchResult`` with confidence ``1.0`` and method
        ``MatchMethod.UPC`` for the first matching product, or ``None`` when
        no product lists this UPC.
    """
    dialect_name = session.bind.dialect.name if session.bind else "default"
    if dialect_name == "postgresql":
        # Pass the Python list and let the JSONB type serialize it once.
        # Handing a pre-serialized ``json.dumps([upc])`` string to a
        # JSONB-typed bind parameter would be serialized a second time,
        # turning the right-hand side into a JSON *string* scalar that
        # ``@>`` never matches against an array of UPCs.
        stmt = select(NormalizedProduct).where(
            cast(NormalizedProduct.upc_variants, JSONB).contains([upc])
        )
    else:
        # Search for the quoted JSON token (e.g. ``"0123"``) rather than the
        # bare digits, so that a UPC which is a substring of another stored
        # UPC does not produce a false "exact" match. ``autoescape`` keeps
        # any ``%`` / ``_`` in the input from acting as LIKE wildcards.
        # NOTE(review): assumes the stored JSON text encodes strings the same
        # way ``json.dumps`` does — true for plain ASCII UPCs; confirm if a
        # custom JSON serializer is configured.
        stmt = select(NormalizedProduct).where(
            NormalizedProduct.upc_variants.is_not(None),
            cast(NormalizedProduct.upc_variants, String).contains(
                json.dumps(upc), autoescape=True
            ),
        )
    product = session.execute(stmt).scalars().first()
    if product:
        return MatchResult(product=product, confidence=1.0, method=MatchMethod.UPC)
    return None