Upgrade to Pro — share decks privately, control downloads, hide ads and more …

Semantic search with Django and pgvector

Semantic search with Django and pgvector

-- Python Pescara 2024

A demonstration of building a semantic search in Python with the Django web framework and pgvector, the PostgreSQL extension for vector storage and similarity search.

https://www.paulox.net/2024/04/05/python-pescara-2024-04/

Paolo Melchiorre

April 05, 2024
Tweet

More Decks by Paolo Melchiorre

Other Decks in Programming

Transcript

  1. • PSF, DSF, Python Italia member • PyCon Italia co-organizer

    • Python Pescara organizer • Django contributor • Django Girls coach • Conference speaker Paolo Melchiorre paulox.net © 2022 Bartek Pawlik (CC BY-NC-SA)
  2. 5 Django “The web framework for perfectionists with deadlines.” Project

    start in Lawrence Journal-World (2003) Public release and WSGI (PEP 333) support (2005) Django Software Foundation (2008) Python 3 support (2013) ASGI support (2019) Psycopg 3 support (2022)
  3. 7 """Flask quickstart.""" from flask import Flask app = Flask(__name__)

    @app.route("/") def hello_world(): return "<p>Hello, World!</p>" # https://flask.palletsprojects.com/en/latest/quickstart/
  4. 9 """FastAPI first step.""" from fastapi import FastAPI app =

    FastAPI() @app.get("/") async def root(): return {"message": "Hello World"} # https://fastapi.tiangolo.com/tutorial/first-steps/
  5. """μDjango WSGI example.""" from django import conf, http, urls from

    django.core.handlers import wsgi conf.settings.configure(ROOT_URLCONF=__name__) app = wsgi.WSGIHandler() urlpatterns = [urls.path("", lambda r: http.HttpResponse("🚀"))] # https://github.com/pauloxnet/uDjango 11
  6. """μDjango ASGI example.""" from django import conf, http, urls from

    django.core.handlers import asgi conf.settings.configure(ROOT_URLCONF=__name__) app = asgi.ASGIHandler() async def root(request): return http.JsonResponse({"message": "Hello World"}) urlpatterns = [urls.path("", root)] # https://github.com/pauloxnet/uDjango 12
  7. $ # ------------ $ # Requirements $ # ------------ $

    $ python3 --version Python 3.12.0 $ python3.12 -m venv ~/.venv $ . ~/.venv/bin/activate $ python -m pip install django ... Successfully installed asgiref-3.8 django-5.0 sqlparse-0.4 14
  8. $ # ------------- $ # Start project $ # -------------

    $ $ cd ~/projects $ python -m django startproject semanticsearch $ tree --noreport semanticsearch/ semanticsearch/ ├── manage.py └── semanticsearch ├── asgi.py ├── __init__.py ├── settings.py ├── urls.py └── wsgi.py 15
  9. 16 $ # --------- $ # Start app $ #

    --------- $ $ cd semanticsearch/ $ python -m django startapp items $ tree --noreport items/ items/ ├── admin.py ├── apps.py ├── __init__.py ├── migrations │ └── __init__.py ├── models.py ├── tests.py └── views.py
  10. 18 # semanticsearch/items/models.py from django.db import models class Item(models.Model): content

    = models.TextField() price = models.IntegerField(db_default=10) in_stock = models.BooleanField(db_default=True)
  11. 19 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0001_initial.py - Create model Item $ python -m manage migrate Operations to perform: Target specific migration: 0001_initial, from items Running migrations: Applying items.0001_initial... OK $ python -m manage sqlmigrate items 0001
  12. 20 BEGIN; -- -- Create model Item -- CREATE TABLE

    "items_item" ( "id" integer NOT NULL PRIMARY KEY AUTOINCREMENT, "content" text NOT NULL, "price" integer DEFAULT 10 NOT NULL, "in_stock" bool DEFAULT 1 NOT NULL ); COMMIT;
  13. 21 $ python -m manage shell Python 3.12.0 (main, Oct

    4 2023) [GCC 13.2.0] on linux Type "help", "copyright", "credits" or "license" for more info. >>> from items.models import Item >>> Item.objects.filter( ... content__icontains="rock", price=10, in_stock=True ... ).order_by( ... "content" ... ).values_list( ... "content" ... ) <QuerySet [('rock',), ('rocket',)]>
  14. 22 -- -- Django ORM generated SQL from SQLite --

    SELECT "items_item"."content" FROM "items_item" WHERE ( "items_item"."content" LIKE '%rock%' ESCAPE '\' AND "items_item"."in_stock" AND "items_item"."price" = 10 ) ORDER BY "items_item"."content" ASC;
  15. 23 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): list_display = ["content" , "price", "in_stock"] list_filter = ["price", "in_stock"] search_fields = ["content"] show_facets = admin.ShowFacets.ALWAYS
  16. $ # ---------- $ # Run server $ # ----------

    $ $ python -m manage createsuperuser $ python -m manage runserver Watching for file changes with StatReloader Performing system checks... System check identified no issues (0 silenced). April 05, 2024 - 19:30:00 Django version 5.0.4, using settings 'semanticsearch.settings' Starting development server at http://127.0.0.1:8000/ Quit the server with CONTROL-C. 24
  17. $ # ---------- $ # Psycopg v3 $ # ----------

    $ $ python -m pip install psycopg[binary] ... Successfully installed psycopg-3.1.18 psycopg-binary-3.1.18 typing-extensions-4.10.0 30
  18. 31 # semanticsearch/semanticsearch/settings.py DATABASES = { "default": { "ENGINE": "django.db.backends.postgresql",

    "HOST": "<my_database_host>", "NAME": "<my_database_name>", "PASSWORD": "<my_database_password>", "PORT": "<my_database_port>", "USER": "<my_database_user>", } }
  19. 33 $ python -m manage shell Python 3.12.0 (main, Oct

    4 2023) [GCC 13.2.0] on linux Type "help", "copyright", "credits" or "license" for more info. >>> from items.models import Item >>> Item.objects.filter(content__icontains="rocks") <QuerySet []> >>> Item.objects.filter( ... content__search="rocks" ... ).values_list("content", flat=True) <QuerySet ['rock']>
  20. 34 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."content" FROM "items_item" WHERE to_tsvector( COALESCE("items_item"."content", '') ) @@ ( plainto_tsquery('rocks') );
  21. 35 # semanticsearch/items/models.py from django.db import models from django.contrib.postgres import

    search class Item(models.Model): ... vector = models.GeneratedField( db_persist=True, expression=search.SearchVector( "content", config="english" ), output_field=search.SearchVectorField(), )
  22. 36 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0002_item_vector.py - Add field vector to item $ python -m manage migrate Operations to perform: Target specific migration: 0002_item_vector, from items Running migrations: Applying items.0002_item_vector... OK $ python -m manage sqlmigrate items 0002
  23. 37 BEGIN; -- -- Add field vector to item --

    ALTER TABLE "items_item" ADD COLUMN "vector" tsvector GENERATED ALWAYS AS ( to_tsvector('english'::regconfig, COALESCE("content", '')) ) STORED; COMMIT;
  24. “… improve search accuracy by understanding the searcher's intent and

    the contextual meaning of terms …” 38 — Wikipedia “Semantic search”
  25. 39 Embedding System Vector embeddings [ [1,3,4], … ] Data

    📸 📄 📽 🎧 Embedding model 🧮
  26. $ # --------------- $ # pgvector-python $ # --------------- $

    $ python -m pip install pgvector ... Successfully installed pgvector-0.2.5 45
  27. $ # ---------------- $ # vector extension $ # ----------------

    $ $ python -m manage makemigrations --empty --name pgvector items Migrations for 'items': items/migrations/0003_pgvector.py 46
  28. 47 # items/migrations/0003_pgvector.py from django.db import migrations from pgvector.django import

    VectorExtension class Migration(migrations.Migration): dependencies = [] operations = [VectorExtension()]
  29. 48 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    VectorField class Item(models.Model): ... embedding = VectorField(dimensions=512, editable=False)
  30. 49 $ # --------- $ # Migration $ # ---------

    $ $ python -m manage makemigrations items Migrations for 'items': items/migrations/0004_item_embedding.py - Add field embedding to item $ python -m manage migrate Operations to perform: Target specific migration: 0004_item_embedding, from items Running migrations: Applying items.0004_item_embedding... OK $ python -m manage sqlmigrate items 0004
  31. 50 BEGIN; -- -- Add field embedding to item --

    ALTER TABLE "items_item" ADD COLUMN "embedding" vector(512) NOT NULL; COMMIT;
  32. 51

  33. $ # --------------------- $ # Sentence Transformers $ # ---------------------

    $ $ python -m pip install sentence-transformers ... Successfully installed ... sentence-transformers-2.6.1 52
  34. 53 # semanticsearch/items/models.py from django.db import models from pgvector.django import

    CosineDistance class Item(models.Model): ... @classmethod def search(cls, q, dmax=0.5): distance = CosineDistance("embedding", T.encode(q)) return ( cls.objects.alias(distance=distance) .filter(distance__lt=dmax) .order_by(distance) )
  35. 54 -- -- Django ORM generated SQL from PostgreSQL --

    SELECT "items_item"."id", "items_item"."content", "items_item"."embedding" FROM "items_item" WHERE ("items_item"."embedding" <=> '[...]') < 0.5 ORDER BY ("items_item"."embedding" <=> '[...]') ASC;
  36. 55 # semanticsearch/items/admin.py from django.contrib import admin from items.models import

    Item @admin.register(Item) class ItemAdmin(admin.ModelAdmin): ... def get_search_results(self, request, ...): queryset, _ = super().get_search_results(request,...) if search_term: queryset |= self.model.search(search_term) return queryset, _