Add Xtransformer to backend #798

Draft · wants to merge 21 commits into base: main

Changes from 17 commits
6 changes: 5 additions & 1 deletion Dockerfile
@@ -2,7 +2,7 @@ FROM python:3.10-slim-bookworm
LABEL org.opencontainers.image.authors="[email protected]"
SHELL ["/bin/bash", "-c"]

ARG optional_dependencies="voikko fasttext nn omikuji yake spacy stwfsa"
ARG optional_dependencies="voikko fasttext nn omikuji yake spacy stwfsa pecos"
ARG POETRY_VIRTUALENVS_CREATE=false

# Install system dependencies needed at runtime:
@@ -37,6 +37,10 @@ RUN if [[ $optional_dependencies =~ "spacy" ]]; then \
python -m spacy download $model; \
done; \
fi
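# pecos downloads transformer models at runtime; assuming the cache ends up
# under /.cache (the default when no home directory exists), pre-create it
# world-writable so the image also works when run as a non-root user: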
RUN if [[ $optional_dependencies =~ "pecos" ]]; then \
mkdir /.cache -m a=rwx; \
fi


# Second round of installation with the actual code:
COPY annif /Annif/annif
12 changes: 12 additions & 0 deletions annif/backend/__init__.py
@@ -89,6 +89,17 @@
    return tfidf.TFIDFBackend


def _xtransformer() -> Type[AnnifBackend]:
    try:
        from . import xtransformer

        return xtransformer.XTransformerBackend
    except ImportError:
        raise ValueError(
            "XTransformer not available, not enabling XTransformer backend"
        )


def _yake() -> Type[AnnifBackend]:
    try:
        from . import yake
@@ -111,6 +122,7 @@
"stwfsa": _stwfsa,
"svc": _svc,
"tfidf": _tfidf,
"xtransformer": _xtransformer,
"yake": _yake,
}
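Like the other optional backends, _xtransformer defers the heavy pecos import until a project actually requests the backend, so registering it in this table is all the wiring needed. A minimal sketch of how such a name-to-loader table is typically resolved (illustrative only; the lookup function and dict name are assumptions, not necessarily Annif's exact code):

    def get_backend(backend_id: str) -> Type[AnnifBackend]:
        try:
            # the loader itself raises ValueError when optional deps are missing
            return _backend_fns[backend_id]()
        except KeyError:
            raise ValueError("No such backend type {}".format(backend_id))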

6 changes: 1 addition & 5 deletions annif/backend/fasttext.py
@@ -124,11 +124,7 @@ def _create_model(self, params: dict[str, Any], jobs: int) -> None:
self.info("creating fastText model")
trainpath = os.path.join(self.datadir, self.TRAIN_FILE)
modelpath = os.path.join(self.datadir, self.MODEL_FILE)
params = {
param: self.FASTTEXT_PARAMS[param](val)
for param, val in params.items()
if param in self.FASTTEXT_PARAMS
}
params = annif.util.apply_param_parse_config(self.FASTTEXT_PARAMS, params)
if jobs != 0: # jobs set by user to non-default value
params["thread"] = jobs
self.debug("Model parameters: {}".format(params))
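Both this change and the analogous stwfsa change below swap a hand-written dict comprehension for the shared helper annif.util.apply_param_parse_config. Judging from the comprehension it replaces, the helper presumably behaves like this sketch (not necessarily the exact implementation):

    def apply_param_parse_config(config, params):
        # Cast each known parameter with its configured parser; ignore unknown keys.
        return {
            param: config[param](val)
            for param, val in params.items()
            if param in config
        }

    # e.g. apply_param_parse_config({"thread": int}, {"thread": "4", "x": "y"})
    # returns {"thread": 4}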
4 changes: 1 addition & 3 deletions annif/backend/omikuji.py
@@ -103,9 +103,7 @@ def _create_model(self, params: dict[str, Any], jobs: int) -> None:
        hyper_param.collapse_every_n_layers = int(params["collapse_every_n_layers"])

        self._model = omikuji.Model.train_on_data(train_path, hyper_param, jobs or None)
        if os.path.exists(model_path):
            shutil.rmtree(model_path)
        self._model.save(os.path.join(self.datadir, self.MODEL_FILE))
        annif.util.atomic_save_folder(self._model, model_path)

    def _train(
        self,
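The manual rmtree-and-save above is replaced by annif.util.atomic_save_folder. The name suggests the usual write-to-temp-then-swap pattern; a minimal sketch of that idea, assuming the model object exposes a save(path) method (Annif's actual helper may differ in details):

    import os
    import shutil
    import tempfile

    def atomic_save_folder(obj, dirname):
        # Write the model into a temporary sibling directory first ...
        tempdir = tempfile.mkdtemp(dir=os.path.dirname(dirname))
        obj.save(tempdir)
        # ... then swap it into place, so an interrupted save never leaves a
        # half-written model at the destination path.
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.replace(tempdir, dirname)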
8 changes: 2 additions & 6 deletions annif/backend/stwfsa.py
@@ -7,7 +7,7 @@

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion
from annif.util import atomic_save, boolean
from annif.util import apply_param_parse_config, atomic_save, boolean

from . import backend

@@ -106,11 +106,7 @@ def _train(
        jobs: int = 0,
    ) -> None:
        X, y = self._load_data(corpus)
        new_params = {
            key: self.STWFSA_PARAMETERS[key](val)
            for key, val in params.items()
            if key in self.STWFSA_PARAMETERS
        }
        new_params = apply_param_parse_config(self.STWFSA_PARAMETERS, params)
        p = StwfsapyPredictor(
            graph=self.project.vocab.as_graph(),
            langs=frozenset([params["language"]]),
259 changes: 259 additions & 0 deletions annif/backend/xtransformer.py
@@ -0,0 +1,259 @@
"""Annif backend using the transformer variant of pecos."""

from __future__ import annotations

import logging
import os.path as osp
from sys import stdout
from typing import TYPE_CHECKING, Any

import numpy as np
import scipy.sparse as sp
from pecos.utils.featurization.text.preprocess import Preprocessor
from pecos.xmc.xtransformer import matcher, model
from pecos.xmc.xtransformer.model import XTransformer
from pecos.xmc.xtransformer.module import MLProblemWithText

from annif.exception import NotInitializedException, NotSupportedException
from annif.suggestion import SubjectSuggestion, SuggestionBatch
from annif.util import (
    apply_param_parse_config,
    atomic_save,
    atomic_save_folder,
    boolean,
)

from . import backend, mixins

if TYPE_CHECKING:
    from annif.corpus.document import DocumentCorpus

class XTransformerBackend(mixins.TfidfVectorizerMixin, backend.AnnifBackend):
    """XTransformer based backend for Annif"""

name = "xtransformer"
needs_subject_index = True

Check warning on line 38 in annif/backend/xtransformer.py

View check run for this annotation

Codecov / codecov/patch

annif/backend/xtransformer.py#L37-L38

Added lines #L37 - L38 were not covered by tests

_model = None

Check warning on line 40 in annif/backend/xtransformer.py

View check run for this annotation

Codecov / codecov/patch

annif/backend/xtransformer.py#L40

Added line #L40 was not covered by tests

train_X_file = "xtransformer-train-X.npz"
train_y_file = "xtransformer-train-y.npz"
train_txt_file = "xtransformer-train-raw.txt"
model_folder = "xtransformer-model"

Check warning on line 45 in annif/backend/xtransformer.py

View check run for this annotation

Codecov / codecov/patch

annif/backend/xtransformer.py#L42-L45

Added lines #L42 - L45 were not covered by tests

    PARAM_CONFIG = {
        "min_df": int,
        "ngram": int,
        "fix_clustering": boolean,
        "nr_splits": int,
        "min_codes": int,
        "max_leaf_size": int,
        "imbalanced_ratio": float,
        "imbalanced_depth": int,
        "max_match_clusters": int,
        "do_fine_tune": boolean,
        "model_shortcut": str,
        "beam_size": int,
        "limit": int,
        "post_processor": str,
        "negative_sampling": str,
        "ensemble_method": str,
        "threshold": float,
        "loss_function": str,
        "truncate_length": int,
        "hidden_dropout_prob": float,
        "batch_size": int,
        "gradient_accumulation_steps": int,
        "learning_rate": float,
        "weight_decay": float,
        "adam_epsilon": float,
        "num_train_epochs": int,
        "max_steps": int,
        "lr_schedule": str,
        "warmup_steps": int,
        "logging_steps": int,
        "save_steps": int,
        "max_active_matching_labels": int,
        "max_num_labels_in_gpu": int,
        "use_gpu": boolean,
        "bootstrap_model": str,

Comment by @katjakon (Nov 20, 2024):

Regarding my previous comments about hyperparameters: it should be fairly easy
to incorporate additional hyperparameters. Adding the following lines to
PARAM_CONFIG would allow us to make use of the hyperparameters Cp and Cn in the
project configurations:

    "Cn": float,
    "Cp": float,

And similarly for the dict DEFAULT_PARAMETERS:

    "Cn": 1.0,
    "Cp": 1.0,

Let me know if there are any questions!

    }

    DEFAULT_PARAMETERS = {
"min_df": 1,
"ngram": 1,
"fix_clustering": False,
"nr_splits": 16,
"min_codes": None,
"max_leaf_size": 100,
"imbalanced_ratio": 0.0,
"imbalanced_depth": 100,
"max_match_clusters": 32768,
"do_fine_tune": True,
"model_shortcut": "distilbert-base-multilingual-uncased",
"beam_size": 20,
"limit": 100,
"post_processor": "sigmoid",
"negative_sampling": "tfn",
"ensemble_method": "transformer-only",
"threshold": 0.1,
"loss_function": "squared-hinge",
"truncate_length": 128,
"hidden_droput_prob": 0.1,
"batch_size": 32,
"gradient_accumulation_steps": 1,
"learning_rate": 1e-4,
"weight_decay": 0.0,
"adam_epsilon": 1e-8,
"num_train_epochs": 1,
"max_steps": 0,
"lr_schedule": "linear",
"warmup_steps": 0,
"logging_steps": 100,
"save_steps": 1000,
"max_active_matching_labels": None,
"max_num_labels_in_gpu": 65536,
"use_gpu": True,
"bootstrap_model": "linear",
}

    def _initialize_model(self):
        if self._model is None:
            path = osp.join(self.datadir, self.model_folder)
            self.debug("loading model from {}".format(path))
            if osp.exists(path):
                self._model = XTransformer.load(path)
            else:
                raise NotInitializedException(
                    "model {} not found".format(path), backend_id=self.backend_id
                )

    def initialize(self, parallel: bool = False) -> None:
        self.initialize_vectorizer()
        self._initialize_model()

    def default_params(self):
        params = backend.AnnifBackend.DEFAULT_PARAMETERS.copy()
        params.update(self.DEFAULT_PARAMETERS)
        return params

    def _create_train_files(self, veccorpus, corpus):
        self.info("creating train file")
        Xs = []
        ys = []
        txt_pth = osp.join(self.datadir, self.train_txt_file)
        with open(txt_pth, "w", encoding="utf-8") as txt_file:
            for doc, vector in zip(corpus.documents, veccorpus):
                subject_set = doc.subject_set
                if not (subject_set and doc.text):
                    continue  # noqa
                print(" ".join(doc.text.split()), file=txt_file)
                Xs.append(sp.csr_matrix(vector, dtype=np.float32).sorted_indices())
                ys.append(
                    sp.csr_matrix(
                        (
                            np.ones(len(subject_set)),
                            (np.zeros(len(subject_set)), [s for s in subject_set]),
                        ),
                        shape=(1, len(self.project.subjects)),
                        dtype=np.float32,
                    ).sorted_indices()
                )
        atomic_save(
            sp.vstack(Xs, format="csr"),
            self.datadir,
            self.train_X_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )
        atomic_save(
            sp.vstack(ys, format="csr"),
            self.datadir,
            self.train_y_file,
            method=lambda mtrx, target: sp.save_npz(target, mtrx, compressed=True),
        )

    def _create_model(self, params, jobs):
        train_txts = Preprocessor.load_data_from_file(
            osp.join(self.datadir, self.train_txt_file),
            label_text_path=None,
            text_pos=0,
        )["corpus"]
        train_X = sp.load_npz(osp.join(self.datadir, self.train_X_file))
        train_y = sp.load_npz(osp.join(self.datadir, self.train_y_file))
        model_path = osp.join(self.datadir, self.model_folder)
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        new_params["only_topk"] = new_params.pop("limit")
        train_params = XTransformer.TrainParams.from_dict(
            new_params, recursive=True
        ).to_dict()
        pred_params = XTransformer.PredParams.from_dict(
            new_params, recursive=True
        ).to_dict()

        self.info("Start training")
        # enable progress logging from pecos
        matcher.LOGGER.setLevel(logging.DEBUG)
        matcher.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        model.LOGGER.setLevel(logging.DEBUG)
        model.LOGGER.addHandler(logging.StreamHandler(stream=stdout))
        self._model = XTransformer.train(
            MLProblemWithText(train_txts, train_y, X_feat=train_X),
            clustering=None,
            val_prob=None,
            train_params=train_params,
            pred_params=pred_params,
            beam_size=int(params["beam_size"]),
            steps_scale=None,
            label_feat=None,
        )
        atomic_save_folder(self._model, model_path)

    def _train(
        self,
        corpus: DocumentCorpus,
        params: dict[str, Any],
        jobs: int = 0,
    ) -> None:
        if corpus == "cached":
            self.info("Reusing cached training data from previous run.")
        else:
            if corpus.is_empty():
                raise NotSupportedException("Cannot train project with no documents")
            input = (doc.text for doc in corpus.documents)
            vecparams = {
                "min_df": int(params["min_df"]),
                "tokenizer": self.project.analyzer.tokenize_words,
                "ngram_range": (1, int(params["ngram"])),
            }
            veccorpus = self.create_vectorizer(input, vecparams)
            self._create_train_files(veccorpus, corpus)
        self._create_model(params, jobs)

    def _suggest_batch(
        self, texts: list[str], params: dict[str, Any]
    ) -> SuggestionBatch:
        vector = self.vectorizer.transform(texts)

        if vector.nnz == 0:  # All zero vector, return an empty result per text
            return SuggestionBatch.from_sequence(
                [[] for _ in texts], self.project.subjects
            )
        new_params = apply_param_parse_config(self.PARAM_CONFIG, params)
        prediction = self._model.predict(
            texts,
            X_feat=vector.sorted_indices(),
            batch_size=new_params["batch_size"],
            use_gpu=new_params["use_gpu"],
            only_top_k=new_params["limit"],
            post_processor=new_params["post_processor"],
        )
        current_batchsize = prediction.get_shape()[0]
        batch_result = []
        for i in range(current_batchsize):
            results = []
            row = prediction.getrow(i)
            for idx, score in zip(row.indices, row.data):
                results.append(SubjectSuggestion(subject_id=idx, score=score))
            batch_result.append(results)
        return SuggestionBatch.from_sequence(batch_result, self.project.subjects)
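To make the sparse bookkeeping concrete: _create_train_files encodes each document's subjects as a one-row indicator matrix, and _suggest_batch reads subject ids and scores straight out of the prediction matrix's CSR structure. A small self-contained illustration (subject ids and vocabulary size invented for the example):

    import numpy as np
    import scipy.sparse as sp

    # A document indexed with subjects 2 and 5, out of 8 subjects in total,
    # becomes a single-row sparse indicator matrix as in _create_train_files:
    subject_ids = [2, 5]
    row = sp.csr_matrix(
        (np.ones(len(subject_ids)), (np.zeros(len(subject_ids)), subject_ids)),
        shape=(1, 8),
        dtype=np.float32,
    )
    print(row.toarray())  # [[0. 0. 1. 0. 0. 1. 0. 0.]]

    # _suggest_batch walks prediction rows the same way: CSR indices are
    # subject ids and CSR data are the scores.
    for idx, score in zip(row.indices, row.data):
        print(idx, score)  # prints 2 1.0, then 5 1.0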
