Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Semantic search with Django and pgvector

Semantic search with Django and pgvector

-- Python Pescara 2024

A demonstration of how to build a semantic search in Python using the Django web framework and pgvector, the PostgreSQL extension for vector storage and similarity search.


Paolo Melchiorre

April 05, 2024

More Decks by Paolo Melchiorre

Other Decks in Programming


  1. • PSF, DSF, Python Italia member • PyCon Italia co-organizer

    • Python Pescara organizer • Django contributor • Django Girls coach • Conference speaker Paolo Melchiorre paulox.net © 2022 Bartek Pawlik (CC BY-NC-SA)
  2. 5 Django “The web framework for perfectionists with deadlines.” Project

    start in Lawrence Journal-World (2003) Public release and WSGI (PEP 333) support (2005) Django Software Foundation (2008) Python 3 support (2013) ASGI support (2019) Psycopg 3 support (2022)
  3. 7 """Flask quickstart.""" from flask import Flask app = Flask(__name__)

    @app.route("/") def hello_world(): return "<p>Hello, World!</p>" # https://flask.palletsprojects.com/en/latest/quickstart/
  4. 9 """FastAPI first step.""" from fastapi import FastAPI app =

    FastAPI() @app.get("/") async def root(): return {"message": "Hello World"} # https://fastapi.tiangolo.com/tutorial/first-steps/
  5. """μDjango WSGI example.""" from django import conf, http, urls from

    django.core.handlers import wsgi conf.settings.configure(ROOT_URLCONF=__name__) app = wsgi.WSGIHandler() urlpatterns = [urls.path("", lambda r: http.HttpResponse("🚀"))] # https://github.com/pauloxnet/uDjango 11
  6. """μDjango ASGI example.""" from django import conf, http, urls from

    django.core.handlers import asgi conf.settings.configure(ROOT_URLCONF=__name__) app = asgi.ASGIHandler() async def root(request): return http.JsonResponse({"message": "Hello World"}) urlpatterns = [urls.path("", root)] # https://github.com/pauloxnet/uDjango 12
  7. $ # ------------ $ # Requirements $ # ------------ $

    $ python3 --version Python 3.12.0 $ python3.12 -m venv ~/.venv $ . ~/.venv/bin/activate $ python -m pip install django ... Successfully installed asgiref-3.8 django-5.0 sqlparse-0.4 14
  8. $ # ------------- $ # Start project $ # -------------

    $ $ cd ~/projects $ python -m django startproject semanticsearch $ tree --noreport semanticsearch/ semanticsearch/ ├── manage.py └── semanticsearch ├── asgi.py ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py 15
  9. 16 $ # --------- $ # Start app $ #

    --------- $ $ cd semanticsearch/ $ python -m django startapp items $ tree --noreport items/ items/ ├── admin.py ├── apps.py ├── __init__.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py └── views.py
  10. 18 # semanticsearch/items/models.py from django.db import models class Item(models.Model): content

    = models.TextField() price = models.IntegerField(db_default=10) in_stock = models.BooleanField(db_default=True)
  11. 19 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0001_initial.py - Create model Item $ python -m manage migrate Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK $ python -m manage sqlmigrate items 0001
  12. 20 BEGIN; -- -- Create model Item -- CREATE TABLE

    "items_item" ( "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "content" text NOT NULL, "price" integer DEFAULT 10 NOT NULL, "in_stock" bool DEFAULT 1 NOT NULL ); COMMIT;
  13. 21 $ python -m manage shell Python 3.12.0 (main, Oct

    4 2023) [GCC 13.2.0] on linux Type "help", "copyright", "credits" or "license" for more info. >>> from items.models import Item >>> Item.objects.filter( ... content__icontains="rock", price=10, in_stock=True ... ).order_by( ... "content" ... ).values_list( ... "content", flat=True ... ) <QuerySet ['rock', 'rocket']>
  14. 22 -- -- Django ORM generated SQL from SQLite --

    SELECT "items_item"."content" FROM "items_item" WHERE ( "items_item"."content" LIKE '%rock%' ESCAPE '\' AND "items_item"."in_stock" AND "items_item"."price" = 10 ) ORDER BY "items_item"."content" ASC;
  15. 23 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): list_display = ["content" , "price", "in_stock"] list_filter = ["price", "in_stock"] search_fields = ["content"] show_facets = admin.ShowFacets.ALWAYS
  16. $ # ---------- $ # Run server $ # ----------

    $ $ python -m manage createsuperuser $ python -m manage runserver Watching for file changes with StatReloader Performing system checks... System check identified no issues (0 silenced). April 05, 2024 - 19:30:00 Django version 5.0.4, using settings 'semanticsearch.settings' Starting development server at http://127.0.0.1:8000/ Quit the server with CONTROL-C. 24
  17. $ # ---------- $ # Psycopg v3 $ # ----------

    $ $ python -m pip install psycopg[binary] ... Successfully installed psycopg-3.1.18 psycopg-binary-3.1.18 typing-extensions-4.10.0 30
  18. 31 # semanticsearch/semanticsearch/settings.py DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql",

    "HOST": "<my_database_host>", "NAME": "<my_database_name>", "PASSWORD": "<my_database_password>", "PORT": "<my_database_port>", "USER": "<my_database_user>", } }
  19. 33 $ python -m manage shell Python 3.12.0 (main, Oct

    4 2023) [GCC 13.2.0] on linux Type "help", "copyright", "credits" or "license" for more info. >>> from items.models import Item >>> Item.objects.filter(content__icontains="rocks") <QuerySet []> >>> Item.objects.filter( ... content__search="rocks" ... ).values_list("content", flat=True) <QuerySet ['rock']>
  20. 34 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."content" FROM "items_item" WHERE to_tsvector( COALESCE("items_item"."content", '') ) @@ ( plainto_tsquery('rocks') );
  21. 35 # semanticsearch/items/models.py from django.db import models from django.contrib.postgres import

    search class Item(models.Model): ... vector = models.GeneratedField( db_persist=True, expression=search.SearchVector( "content", config="english" ), output_field=search.SearchVectorField(), )
  22. 36 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0002_item_vector.py - Add field vector to item $ python -m manage migrate Operations to perform: Target specific migration: 0002_item_vector, from items Running migrations: Applying items.0002_item_vector... OK $ python -m manage sqlmigrate items 0002
  23. 37 BEGIN; -- -- Add field vector to item --

    ALTER TABLE "items_item" ADD COLUMN "vector" tsvector GENERATED ALWAYS AS ( to_tsvector('english'::regconfig, COALESCE("content", '')) ) STORED; COMMIT;
  24. “… improve search accuracy by understanding the searcher's intent and

    the contextual meaning of terms …” 38 — Wikipedia “Semantic search”
  25. 39 Embedding System Vector embeddings [ [1,3,4], … ] Data

    📸 📄 📽 🎧 Embedding model 🧮
  26. $ # --------------- $ # pgvector-python $ # --------------- $

    $ python -m pip install pgvector ... Successfully installed pgvector-0.2.5 45
  27. $ # ---------------- $ # vector extension $ # ----------------

    $ $ python -m manage makemigrations --empty --name pgvector items Migrations for 'items': items/migrations/0003_pgvector.py 46
  28. 47 # items/migrations/0003_pgvector.py from django.db import migrations from pgvector.django import

    VectorExtension class Migration(migrations.Migration): dependencies = [] operations = [VectorExtension()]
  29. 48 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    VectorField class Item(models.Model): ... embedding = VectorField(dimensions=512, editable=False)
  30. 49 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0004_item_embedding.py - Add field embedding to item $ python -m manage migrate Operations to perform: Target specific migration: 0004_item_embedding, from items Running migrations: Applying items.0004_item_embedding... OK $ python -m manage sqlmigrate items 0004
  31. 50 BEGIN; -- -- Add field embedding to item --

    ALTER TABLE "items_item" ADD COLUMN "embedding" vector(512) NOT NULL; COMMIT;
  32. 51

  33. $ # --------------------- $ # Sentence Transformers $ # ---------------------

    $ $ python -m pip install sentence-transformers ... Successfully installed ... sentence-transformers-2.6.1 52
  34. 53 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    CosineDistance class Item(models.Model): ... @classmethod def search(cls, q, dmax=0.5): distance = CosineDistance("embedding", T.encode(q)) return ( cls.objects.alias(distance=distance) .filter(distance__lt=dmax) .order_by(distance) )
  35. 54 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."embedding" FROM "items_item" WHERE ("items_item"."embedding" <=> '[...]') < 0.5 ORDER BY ("items_item"."embedding" <=> '[...]') ASC;
  36. 55 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): ... def get_search_results(self, request, ...): queryset, _ = super().get_search_results(request,...) if search_term: queryset |= self.model.search(search_term) return queryset, _