Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Semantic search with Django, PostgreSQL, & pgvector

Semantic search with Django, PostgreSQL, & pgvector

-- Posette Conference 2024

A demonstration of how to build semantic search in Python using the Django web framework and pgvector, the PostgreSQL extension for vector storage and similarity search.

https://www.paulox.net/2024/06/13/posette-2024/

Paolo Melchiorre

June 13, 2024
Tweet

More Decks by Paolo Melchiorre

Other Decks in Technology

Transcript

  1. • PSF, DSF, Python Italia member • PyCon Italia co-organizer

    • Python Pescara organizer • Django contributor • Django Girls coach Paolo Melchiorre paulox.net © 2022 Bartek Pawlik (CC BY-NC-SA)
  2. 5 Django “The web framework for perfectionists with deadlines.” Project

    start in Lawrence Journal-World (2003) Public release and WSGI (PEP 333) support (2005) Python 3 support (2013) PostgreSQL module (2015) ASGI support (2019) Psycopg 3 support (2022)
  3. 7 """Flask quickstart.""" from flask import Flask app = Flask(__name__)

    @app.route("/") def hello_world(): return "<p>Hello, World!</p>" # https://flask.palletsprojects.com/en/latest/quickstart/
  4. 9 """FastAPI first step.""" from fastapi import FastAPI app =

    FastAPI() @app.get("/") async def root(): return {"message": "Hello World"} # https://fastapi.tiangolo.com/tutorial/first-steps/
  5. """μDjango WSGI example.""" from django import conf, http, urls from

    django.core.handlers import wsgi conf.settings.configure(ROOT_URLCONF=__name__) app = wsgi.WSGIHandler() urlpatterns = [urls.path("", lambda r: http.HttpResponse("🚀"))] # https://github.com/pauloxnet/uDjango 11
  6. """μDjango ASGI example.""" from django import conf, http, urls from

    django.core.handlers import asgi conf.settings.configure(ROOT_URLCONF=__name__) app = asgi.ASGIHandler() async def root(request): return http.JsonResponse({"message": "Hello World"}) urlpatterns = [urls.path("", root)] # https://github.com/pauloxnet/uDjango 12
  7. $ # ------------ $ # Requirements $ # ------------ $

    $ python3 --version Python 3.12.3 $ python3.12 -m venv ~/.venv $ . ~/.venv/bin/activate $ python -m pip install django ... Successfully installed asgiref-3.8.1 django-5.0.6 sqlparse-0.5… 14
  8. $ # ------------- $ # Start project $ # -------------

    $ $ cd ~/projects $ python -m django startproject semanticsearch $ tree --noreport semanticsearch/ semanticsearch/ ├── manage.py └── semanticsearch ├── asgi.py ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py 15
  9. 16 $ # --------- $ # Start app $ #

    --------- $ $ cd semanticsearch/ $ python -m django startapp items $ tree --noreport items/ items/ ├── admin.py ├── apps.py ├── __init__.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py └── views.py
  10. 18 # semanticsearch/items/models.py from django.db import models class Item(models.Model): content

    = models.TextField() price = models.IntegerField(db_default=10) in_stock = models.BooleanField(db_default=True) def __str__(self): return self.content
  11. 19 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0001_initial.py - Create model Item $ python -m manage migrate Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK $ python -m manage sqlmigrate items 0001
  12. 20 BEGIN; -- -- Create model Item -- CREATE TABLE

    "items_item" ( "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "content" text NOT NULL, "price" integer DEFAULT 10 NOT NULL, "in_stock" bool DEFAULT 1 NOT NULL ); COMMIT;
  13. 21 $ python -m manage shell >>> from items.models import

    Item >>> Item.objects.filter( ... content__icontains="rock", price=10, in_stock=True ... ) <QuerySet [<Item: rock>, <Item: rocket>]>
  14. 22 -- -- Django ORM generated SQL from SQLite --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock" FROM "items_item" WHERE ( "items_item"."content" LIKE '%rock%' ESCAPE '\' AND "items_item"."in_stock" AND "items_item"."price" = 10 );
  15. 23 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): list_display = ["content" , "price", "in_stock"] list_filter = ["price", "in_stock"] search_fields = ["content"] show_facets = admin.ShowFacets.ALWAYS
  16. $ # ---------- $ # Run server $ # ----------

    $ $ python -m manage createsuperuser $ python -m manage runserver Watching for file changes with StatReloader Performing system checks... System check identified no issues (0 silenced). Django version 5.0.6, using settings 'semanticsearch.settings' Starting development server at http://127.0.0.1:8000/ Quit the server with CONTROL-C. 24
  17. $ # ---------- $ # Psycopg v3 $ # ----------

    $ $ python -m pip install psycopg[binary] ... Successfully installed psycopg-3.1.19 psycopg-binary-3.1.19 t… 30
  18. 31 # semanticsearch/semanticsearch/settings.py DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql",

    "HOST": "<my_database_host>", "NAME": "<my_database_name>", "PASSWORD": "<my_database_password>", "PORT": "<my_database_port>", "USER": "<my_database_user>", } }
  19. 33 $ python -m manage shell >>> from items.models import

    Item >>> Item.objects.filter(content__icontains="rocks") <QuerySet []> >>> Item.objects.filter(content__search="rocks") <QuerySet [<Item: rock>]>
  20. 34 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock" FROM "items_item" WHERE to_tsvector( COALESCE("items_item"."content", '') ) @@ ( plainto_tsquery('rocks') );
  21. 35 # semanticsearch/items/models.py from django.db import models from django.contrib.postgres import

    search class Item(models.Model): ... vector = models.GeneratedField( db_persist=True, expression=search.SearchVector( "content", config="english" ), output_field=search.SearchVectorField(), )
  22. 36 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0002_item_vector.py - Add field vector to item $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0002_item_vector... OK $ python -m manage sqlmigrate items 0002
  23. 37 BEGIN; -- -- Add field vector to item --

    ALTER TABLE "items_item" ADD COLUMN "vector" tsvector GENERATED ALWAYS AS ( to_tsvector('english'::regconfig, COALESCE("content", '')) ) STORED; COMMIT;
  24. “… improve search accuracy by understanding the searcher's intent and

    the contextual meaning of terms …” 38 — Wikipedia “Semantic search”
  25. 39 Embedding System Vector embeddings [ [1,3,4], … ] Data

    📸 📄 📽 🎧 Embedding model 🧮
  26. $ # --------------- $ # pgvector-python $ # --------------- $

    $ python -m pip install pgvector ... Successfully installed pgvector-0.2.5 45
  27. $ # ---------------- $ # vector extension $ # ----------------

    $ $ python -m manage makemigrations --empty --name pgvector items Migrations for 'items': items/migrations/0003_pgvector.py 46
  28. 47 # items/migrations/0003_pgvector.py from django.db import migrations from pgvector.django import

    VectorExtension class Migration(migrations.Migration): dependencies = [ ('items', '0002_item_vector'), ] operations = [VectorExtension()]
  29. 48 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0003_pgvector... OK $ python -m manage sqlmigrate items 0003
  30. 50 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    VectorField class Item(models.Model): ... embedding = VectorField(dimensions=512, editable=False)
  31. 51 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0004_item_embedding.py - Add field embedding to item $ python -m manage migrate items Operations to perform: Target specific migration: items Running migrations: Applying items.0004_item_embedding... OK $ python -m manage sqlmigrate items 0004
  32. 52 BEGIN; -- -- Add field embedding to item --

    ALTER TABLE "items_item" ADD COLUMN "embedding" vector(512) NOT NULL; COMMIT;
  33. 53

  34. $ # --------------------- $ # Sentence Transformers $ # ---------------------

    $ $ python -m pip install sentence-transformers … Successfully installed … sentence-transformers-2.7.0 … 54
  35. 55 # semanticsearch/items/models.py from django.db import models from sentence_transformers import

    SentenceTransformer T = SentenceTransformer("distiluse-base-multilingual-cased-v1") class Item(models.Model): ... def save(self, *args, **kwargs): self.embedding = T.encode(self.content) super().save(*args, **kwargs)
  36. 56 $ python -m manage shell >>> from items.models import

    Item >>> for item in Item.objects.all(): ... item.save()
  37. 57 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    CosineDistance class Item(models.Model): ... @classmethod def search(cls, q, dmax=0.5): distance = CosineDistance("embedding", T.encode(q)) return ( cls.objects.alias(distance=distance) .filter(distance__lt=dmax) .order_by(distance) )
  38. 58 $ python -m manage shell >>> from items.models import

    Item >>> Item.search("rock") <QuerySet [<Item: rock>, <Item: stone>, <Item: music>, …
  39. 59 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."price", "items_item"."in_stock", "items_item"."embedding" FROM "items_item" WHERE ("items_item"."embedding" <=> '[...]') < 0.5 ORDER BY ("items_item"."embedding" <=> '[...]') ASC;
  40. 60 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): ... def get_search_results(self, request, queryset, term): queryset, _ = super().get_search_results( request, queryset, term ) if term: queryset |= self.model.search(term) return queryset, _