Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] add postgres vectordb #767

Open
wants to merge 15 commits into
base: master
Choose a base branch
from
19 changes: 16 additions & 3 deletions .github/workflows/workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -158,12 +158,20 @@ jobs:
-
name: Create Test Database
run: |
sleep 1
until docker compose exec -T \
store_pgsql pg_isready -U postgres; do sleep 1; done

sleep 1

docker compose exec -T \
store_pgsql \
psql -U postgres -c "create database test_db;"

docker compose exec -T \
store_pgsql \
psql -U postgres -c "create database test_db"
psql -U postgres -d test_db -c "CREATE EXTENSION IF NOT EXISTS vector;"

-
name: Backend Tests
env:
Expand Down Expand Up @@ -434,12 +442,17 @@ jobs:
name: Create Store Database
run: |
cd store
sleep 1
until docker compose exec -T \
store_pgsql pg_isready -U postgres; do sleep 1; done

# docker compose exec -T \
# store_pgsql \
# psql -U postgres -c "create database neurostore"

docker compose exec -T \
store_pgsql \
psql -U postgres -c "create database neurostore"
store_pgsql \
psql -U postgres -d neurostore -c "CREATE EXTENSION IF NOT EXISTS vector;"
-
name: Initialize Compose Database
run: |
Expand Down
2 changes: 2 additions & 0 deletions store/.env.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ ACE_DIR=/media
FILE_DIR=/tmp
COMPOSE_CONVERT_WINDOWS_PATHS=1
POSTGRES_HOST=store_pgsql
POSTGRES_USER=postgres
POSTGRES_DB=neurostore
POSTGRES_PASSWORD=example
BEARERINFO_FUNC=neurostore.resources.auth.decode_token
AUTH0_CLIENT_ID=YOUR_CLIENT_ID
Expand Down
147 changes: 78 additions & 69 deletions store/neurostore/ingest/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -469,9 +469,6 @@ def ace_ingestion_logic(coordinates_df, metadata_df, text_df):
# see if there are duplicates for the newly created base_studies
all_base_studies = []
with db.session.no_autoflush:
all_studies = {
s.pmid: s for s in Study.query.filter_by(source="neurosynth").all()
}
for metadata_row, text_row in zip(
metadata_df.itertuples(), text_df.itertuples()
):
Expand Down Expand Up @@ -561,76 +558,88 @@ def ace_ingestion_logic(coordinates_df, metadata_df, text_df):

# append base study to commit
to_commit.append(base_study)
# keep track of all created/modified base studies
all_base_studies.append(base_study)

s = all_studies.get(pmid, Study())

# try to update the study if information is missing
study_info = {
"name": metadata_row.title,
"doi": doi,
"pmid": pmid,
"description": text_row.abstract,
"authors": metadata_row.authors,
"publication": metadata_row.journal,
"year": year,
"level": "group",
"source": "neurosynth",
}
for col, value in study_info.items():
source_attr = getattr(s, col)
setattr(s, col, source_attr or value)
relevant_studies = base_study.versions
if not relevant_studies:
relevant_studies.append(Study())

analyses = []
points = []
# if all studies have a user,
# add a new study version to incorporate the new data
if all(s.user is not None for s in relevant_studies):
relevant_studies.append(Study())

try:
study_coord_data = coordinates_df.loc[[id_]]
except KeyError:
print(f"pmid: {id_} has no coordinates")
continue
for order, (t_id, df) in enumerate(study_coord_data.groupby("table_id")):
a = (
Analysis.query.filter_by(table_id=str(t_id)).one_or_none()
or Analysis()
)
a.name = df["table_label"][0] or str(t_id)
a.table_id = str(t_id)
a.order = a.order or order
a.description = (
df["table_caption"][0]
if not df["table_caption"].isna()[0]
else None
)
if not a.study:
a.study = s
analyses.append(a)
point_idx = 0
for _, p in df.iterrows():
point = Point(
x=p["x"],
y=p["y"],
z=p["z"],
space=metadata_row.coordinate_space,
kind=(
df["statistic"][0]
if not df["statistic"].isna()[0]
else "unknown"
),
analysis=a,
entities=[Entity(label=a.name, level="group", analysis=a)],
order=point_idx,
for s in relevant_studies:
# try to update the study if information is missing
study_info = {
"name": metadata_row.title,
"doi": doi,
"pmid": pmid,
"description": text_row.abstract,
"authors": metadata_row.authors,
"publication": metadata_row.journal,
"year": year,
"level": "group",
"source": "neurosynth",
}
for col, value in study_info.items():
source_attr = getattr(s, col)
setattr(s, col, source_attr or value)
if s.user is not None:
# do not edit studies that are user owned
continue
analyses = []
points = []

try:
study_coord_data = coordinates_df.loc[[id_]]
except KeyError:
print(f"pmid: {id_} has no coordinates")
continue
for order, (t_id, df) in enumerate(
study_coord_data.groupby("table_id")
):
a = (
Analysis.query.filter_by(table_id=str(t_id)).one_or_none()
or Analysis()
)
points.append(point)
point_idx += 1
to_commit.extend(points)
to_commit.extend(analyses)
# append study as version of study
base_study.versions.append(s)

db.session.add_all(to_commit)
db.session.flush()
for bs in all_base_studies:
bs.update_has_images_and_points()
a.name = df["table_label"][0] or str(t_id)
a.table_id = str(t_id)
a.order = a.order or order
a.description = (
df["table_caption"][0]
if not df["table_caption"].isna()[0]
else None
)
if not a.study:
a.study = s
analyses.append(a)
point_idx = 0
for _, p in df.iterrows():
point = Point(
x=p["x"],
y=p["y"],
z=p["z"],
space=metadata_row.coordinate_space,
kind=(
df["statistic"][0]
if not df["statistic"].isna()[0]
else "unknown"
),
analysis=a,
entities=[Entity(label=a.name, level="group", analysis=a)],
order=point_idx,
)
points.append(point)
point_idx += 1
to_commit.extend(points)
to_commit.extend(analyses)

db.session.add_all(to_commit)
db.session.flush()
for bs in all_base_studies:
bs.update_has_images_and_points()
db.session.commit()


Expand Down
4 changes: 3 additions & 1 deletion store/neurostore/models/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
from sqlalchemy.ext.mutable import MutableDict
from sqlalchemy.orm import relationship, backref
from sqlalchemy.sql import func

import shortuuid

from .migration_types import TSVector
from .migration_types import TSVector, PGVector
from ..database import db


Expand Down Expand Up @@ -178,6 +179,7 @@ class BaseStudy(BaseMixin, db.Model):
has_coordinates = db.Column(db.Boolean, default=False, nullable=False)
has_images = db.Column(db.Boolean, default=False, nullable=False)
user_id = db.Column(db.Text, db.ForeignKey("users.external_id"), index=True)
ada_openai_vector = db.Column(PGVector(1536)) # length of openai ada vector
_ts_vector = db.Column(
"__ts_vector__",
TSVector(),
Expand Down
20 changes: 20 additions & 0 deletions store/neurostore/models/migration_types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,25 @@
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import TSVECTOR
from pgvector.sqlalchemy import Vector


class PGVector(sa.types.TypeDecorator):
"""class for semantic search"""

cache_ok = True
impl = Vector

def __init__(self, dim):
super().__init__()
self.impl = Vector(dim)

def process_bind_param(self, value, dialect):
# Ensure the value is of the correct type
return value

def process_result_value(self, value, dialect):
# Ensure the value is returned correctly
return value


class TSVector(sa.types.TypeDecorator):
Expand Down
1 change: 1 addition & 0 deletions store/neurostore/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ gunicorn~=22.0
ipython~=7.19
pandas~=1.2
pip-chill~=1.0
pgvector~=0.2.5
psycopg2-binary~=2.8
pyld~=2.0
python-jose~=3.3
Expand Down
22 changes: 22 additions & 0 deletions store/postgres/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,33 @@
FROM postgres:12.12

# Install necessary dependencies
RUN apt-get update && apt-get install -y dos2unix
RUN apt-get install -yq python3-pip python-dev build-essential
RUN apt-get install -yq cron
RUN pip3 install awscli

# Install pgvector dependencies
RUN apt-get install -y postgresql-server-dev-12 gcc make curl clang

# Download and install pgvector
RUN curl -L https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.0.tar.gz -o pgvector.tar.gz \
&& tar -xzf pgvector.tar.gz \
&& cd pgvector-0.7.0 \
&& make \
&& make install \
&& rm -rf pgvector.tar.gz pgvector-0.7.0

# Copy initialization script
COPY init-vector-extension.sql /docker-entrypoint-initdb.d/

# Copy scripts and set permissions
COPY pg_dump-to-s3 /home
RUN chmod +x /home/pg_dump-to-s3.sh /home/s3-autodelete.sh

# Set up cron jobs
RUN crontab /home/backup.txt
RUN service cron start

# Convert scripts to Unix format
RUN dos2unix /home/pg_dump-to-s3.sh
RUN dos2unix /home/s3-autodelete.sh
1 change: 1 addition & 0 deletions store/postgres/init-vector-extension.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
CREATE EXTENSION IF NOT EXISTS vector;
Loading