Upgrade to Pro — share decks privately, control downloads, hide ads and more …

A Pythonic semantic search - PyCon Italia 2025

A Pythonic semantic search - PyCon Italia 2025

-- A talk I gave at PyCon Italia 2025

Keeping in mind the Pythonic principle that “simple is better than complex”, we’ll see how to implement semantic search in a web service using only an open-source AI stack based on Python, Django, PostgreSQL, pgvector, and Sentence Transformers.

https://www.paulox.net/2025/05/29/pycon-italia-2025/

Avatar for Paolo Melchiorre

Paolo Melchiorre

May 29, 2025
Tweet

More Decks by Paolo Melchiorre

Other Decks in Technology

Transcript

  1. 🐍 PSF fellow 🦄 DSF director 🧡 Django Girls+ coach

    🚀 Djangonaut Space navigator 🇮🇹 PyCon Italia organizer 🐬 Python Pescara founder Paolo Melchiorre 🌐 www.paulox.net © 2022 Bartek Pawlik (CC BY-NC-SA)
  2. 4 $ python3 Python 3.13.3 (main, ...) [GCC 14.2.0] on

    linux Type "help", "copyright", "credits" or "license" for more info... >>> import this The Zen of Python, by Tim Peters Beautiful is better than ugly. Explicit is better than implicit. Simple is better than complex. Complex is better than complicated. Flat is better than nested. Sparse is better than dense. Readability counts. ...
  3. """GET https://github.com/wsvincent/django-microframework""" from django import conf, http, urls from django.core.handlers

    import wsgi conf.settings.configure(ROOT_URLCONF=__name__) app = wsgi.WSGIHandler() def hello_world(_): return http.HttpResponse("<p>Hello, World!</p>") urlpatterns = [urls.path("", hello_world)] 8
  4. """GET https://github.com/pauloxnet/uDjango""" from django import conf, http, urls from django.core.handlers

    import asgi conf.settings.configure(ROOT_URLCONF=__name__) app = asgi.ASGIHandler() async def root(request): return http.JsonResponse({"message": "Hello World"}) urlpatterns = [urls.path("", root)] 12
  5. 14 Django “The web framework for perfectionists with deadlines.” 2003

    - Lawrence Journal-World 2005 - Public release & WSGI 2013 - Python 3 2015 - PostgreSQL 2019 - ASGI 2022 - Psycopg 3
  6. $ # "Requirements" $ $ python3 --version Python 3.13.3 $

    $ python3 -m venv ~/.venv $ source ~/.venv/bin/activate $ python -m pip install django ... Successfully installed asgiref-3.8.1 django-5.2.1 sqlparse-0.5.3 15
  7. $ # "Start project" $ $ cd ~/Projects $ python

    -m django startproject semanticsearch $ tree --noreport semanticsearch/ semanticsearch/ ├── manage.py └── semanticsearch ├── asgi.py ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py 16
  8. 17 $ # "Start app" $ $ cd semanticsearch $

    python -m django startapp items $ tree --noreport items/ items/ ├── admin.py ├── apps.py ├── __init__.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py └── views.py
  9. 19 # semanticsearch/items/models.py from django.db import models class Item(models.Model): content

    = models.TextField() price = models.IntegerField(db_default=10) in_stock = models.BooleanField(db_default=True) def __str__(self): return self.content
  10. 20 $ # "Migration" $ $ python -m manage makemigrations

    items Migrations for 'items': items/migrations/0001_initial.py - Create model Item $ python -m manage migrate Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK $ python -m manage sqlmigrate items 0001
  11. 21 BEGIN; -- -- Create model Item -- CREATE TABLE

    "items_item" ( "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "content" text NOT NULL, "price" integer DEFAULT 10 NOT NULL, "in_stock" bool DEFAULT 1 NOT NULL ); COMMIT;
  12. 22 >>> from items.models import Item >>> items = [Item(content="rock"),

    Item(content="rocket")] >>> Item.objects.bulk_create(items) [<Item: rock>, <Item: rocket>] >>> Item.objects.filter( ... content__icontains="rock", price=10, in_stock=True ... ) <QuerySet [<Item: rock>, <Item: rocket>]>
  13. 23 -- -- Django ORM generated SQL from SQLite --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock" FROM "items_item" WHERE ( "items_item"."content" LIKE '%rock%' ESCAPE '\' AND "items_item"."in_stock" AND "items_item"."price" = 10 );
  14. 24 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): list_display = ["content" , "price", "in_stock"] list_filter = ["price", "in_stock"] search_fields = ["content"] show_facets = admin.ShowFacets.ALWAYS
  15. $ # "Run server" $ $ python -m manage createsuperuser

    $ python -m manage runserver Watching for file changes with StatReloader Performing system checks... System check identified no issues (0 silenced). Django version 5.2.1, using settings 'semanticsearch.settings' Starting development server at http://127.0.0.1:8000/ Quit the server with CONTROL-C. 25
  16. 31 Psycopg “(The) PostgreSQL database adapter for Python.” 2001 -

    psycopg v1 2006 - psycopg2 2011 - Python 3 2013 - JSON 2021 - psycopg v3 2023 - Django support
  17. $ # "Psycopg v3" $ $ python -m pip install

    psycopg[binary] ... Successfully installed psycopg-3.2.9 psycopg-binary-3.2.9 32
  18. 33 # semanticsearch/semanticsearch/settings.py DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql",

    "HOST": "<my_database_host>", "NAME": "<my_database_name>", "PASSWORD": "<my_database_password>", "PORT": "<my_database_port>", "USER": "<my_database_user>", } }
  19. 34 $ # "Migration" $ $ python -m manage migrate

    Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK
  20. 35 >>> from items.models import Item >>> >>> items =

    [Item(content="rock"), Item(content="rocket")] >>> Item.objects.bulk_create(items) [<Item: rock>, <Item: rocket>] >>> >>> Item.objects.filter(content__icontains="rocks") <QuerySet []>
  21. 37 >>> from items.models import Item >>> >>> qs =

    Item.objects.filter(content__search="rocks") >>> qs <QuerySet [<Item: rock>]> >>> >>> import sqlparse >>> print(sqlparse.format(str(qs.query), reindent=True))
  22. 38 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock" FROM "items_item" WHERE to_tsvector(COALESCE("items_item"."content")) @@ plainto_tsquery(rocks)
  23. 39 # semanticsearch/items/models.py from django.db import models from django.contrib.postgres import

    search class Item(models.Model): ... vector = models.GeneratedField( db_persist=True, expression=search.SearchVector("content", config="english"), output_field=search.SearchVectorField(), )
  24. 40 $ # "Migration" $ $ python -m manage makemigrations

    items Migrations for 'items': items/migrations/0002_item_vector.py - Add field vector to item $ $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0002_item_vector... OK $ $ python -m manage sqlmigrate items 0002
  25. 41 BEGIN; -- -- Add field vector to item --

    ALTER TABLE "items_item" ADD COLUMN "vector" tsvector GENERATED ALWAYS AS (to_tsvector('english'::regconfig, COALESCE("content", ''))) STORED; COMMIT;
  26. 42 >>> from items.models import Item >>> >>> qs =

    Item.objects.filter(vector="rocks") >>> qs <QuerySet [<Item: rock>]> >>> >>> import sqlparse >>> print(sqlparse.format(str(qs.query), reindent=True))
  27. 43 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock", "items_item"."vector" FROM "items_item" WHERE "items_item"."vector" @@ plainto_tsquery(rocks)
  28. “… improve search accuracy by understanding the searcher's intent and

    the contextual meaning of terms …” 44 — Wikipedia “Semantic search”
  29. 45 Embedding System Vector embeddings [ [1,3,4], … ] Data

    📸 📄 📽 🎧 Embedding model 🧮
  30. $ # "pgvector-python" $ $ python -m pip install pgvector

    ... Successfully installed numpy-2.2.6 pgvector-0.4.1 51
  31. $ # "vector extension" $ $ python -m manage makemigrations

    --empty --name pgvector items Migrations for 'items': items/migrations/0003_pgvector.py 52
  32. 53 # items/migrations/0003_pgvector.py from django.db import migrations from pgvector.django import

    VectorExtension class Migration(migrations.Migration): dependencies = [ ('items', '0002_item_vector'), ] operations = [VectorExtension()]
  33. 54 $ # "Migration" $ $ python -m manage migrate

    items Operations to perform: Target specific migration: items Running migrations: Applying items.0003_pgvector... OK $ $ python -m manage sqlmigrate items 0003
  34. 56 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    VectorField class Item(models.Model): ... embedding = VectorField(dimensions=512, editable=False)
  35. 57 $ # "Migration" $ $ python -m manage makemigrations

    items Migrations for 'items': items/migrations/0004_item_embedding.py - Add field embedding to item $ $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0004_item_embedding... OK $ $ python -m manage sqlmigrate items 0004
  36. 58 BEGIN; -- -- Add field embedding to item --

    ALTER TABLE "items_item" ADD COLUMN "embedding" vector(512) NOT NULL; COMMIT;
  37. 59

  38. $ # "Sentence Transformers" $ $ python -m pip install

    sentence-transformers ... Successfully installed … sentence-transformers-4.1.0 … 60
  39. 61 # semanticsearch/items/models.py from django.db import models from sentence_transformers import

    SentenceTransformer T = SentenceTransformer("distiluse-base-multilingual-cased-v1") class Item(models.Model): ... def save(self, *args, **kwargs): self.embedding = T.encode(self.content) super().save(*args, **kwargs)
  40. 62 >>> from items.models import Item >>> for item in

    Item.objects.all(): ... item.save()
  41. 63 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    CosineDistance class Item(models.Model): ... @classmethod def search(cls, q, dmax=0.5): distance = CosineDistance("embedding", T.encode(q)) return ( cls.objects.alias(distance=distance) .filter(distance__lt=dmax) .order_by(distance) )
  42. 65 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock", "items_item"."embedding" FROM "items_item" WHERE ("items_item"."embedding" <=> '[...]') < 0.5 ORDER BY ("items_item"."embedding" <=> '[...]') ASC;
  43. 66 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): ... def get_search_results(self, request, queryset, term): queryset, _ = super().get_search_results( request, queryset, term ) if term: queryset |= self.model.search(term) return queryset, _