-
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
14 changed files
with
366 additions
and
50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
CONNECTION_STRING=postgresql://<USERNAME>:<PASSWORD>@<HOST>/<DATABASE> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -137,4 +137,6 @@ dmypy.json | |
# Cython debug symbols | ||
cython_debug/ | ||
|
||
.DS_Store | ||
.DS_Store | ||
|
||
.env |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
[settings] | ||
multi_line_output=3 | ||
include_trailing_comma=True | ||
line_length=88 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
# See https://pre-commit.com for more information | ||
# See https://pre-commit.com/hooks.html for more hooks | ||
repos: | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v2.4.0 | ||
hooks: | ||
- id: trailing-whitespace | ||
- id: end-of-file-fixer | ||
- id: requirements-txt-fixer | ||
- repo: https://github.com/psf/black | ||
rev: 20.8b1 | ||
hooks: | ||
- id: black | ||
- repo: https://github.com/timothycrosley/isort | ||
rev: 5.5.0 | ||
hooks: | ||
- id: isort |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
# Drugbank Scraper | ||
|
||
## Run | ||
|
||
This repo uses `Python 3.8.5`. | ||
|
||
1. Follow the [Pipenv & Virtual Environments](https://docs.python-guide.org/dev/virtualenvs/) guide to create and activate a virtual environment. | ||
1. Install requirements with `pip install -r requirements.txt`. | ||
1. Create PostgreSQL database. | ||
1. Create a `.env` file with `cp .env.template .env` and fill in the environment variables. |
|
||
## Spiders | ||
|
||
### Drug | ||
|
||
Run `scrapy crawl drug` to run the drug spider and populate the database. This scrapes the data, then creates and populates the database tables. The final data will be in the `drugbank` schema. The spider will: | ||
|
||
1. Scrape the following data: | ||
* DrugBank ID | ||
* SMILES string | ||
* Gene name | ||
* Actions and alternative identifiers of every target. | ||
2. Save scraped data into the previously created PostgreSQL database. | ||
|
||
|
||
![Drugbank Schema](static/drugbank_schema.png "Drugbank Schema") | ||
|
||
|
||
## Development | ||
|
||
1. See the virtual environment step above. | ||
1. Install requirements with `pip install -r requirements-dev.txt`. | ||
1. Run `pre-commit install` to install pre-commit hooks. This repo is already set up to use some pre-commit hooks for code quality purposes. Configuration file is available [here](.pre-commit-config.yaml). More information about pre-commit is available on [their website](https://pre-commit.com/). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
"""Scraper exceptions.""" | ||
|
||
|
||
class ItemException(Exception):
    """Base class for errors raised while handling scraped items."""
|
||
|
||
class UnknownItemException(ItemException):
    """Raised for a scraped item whose type is not recognised."""
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,29 @@ | ||
# Define here the models for your scraped items | ||
# | ||
# See documentation in: | ||
# https://docs.scrapy.org/en/latest/topics/items.html | ||
"""Models for scraped items.""" | ||
from scrapy.item import Field, Item | ||
|
||
import scrapy | ||
|
||
class DrugItem(Item):
    """Scraped drug record; fields mirror the columns of the ``Drug`` model."""

    # Drug identifier (presumably the DrugBank ID -- confirm against the spider).
    id = Field()
    # SMILES string of the drug.
    smiles = Field()
    # When the record was scraped.
    scraped_at = Field()
|
||
class DrugbankItem(scrapy.Item):
    """Placeholder item left over from the Scrapy project template.

    It declares no fields; add them as ``name = scrapy.Field()``.
    """
|
||
class TargetItem(Item):
    """Scraped target of a drug; fields mirror the ``Target`` model columns."""

    # Identifier of the target.
    target_id = Field()
    # Identifier of the drug this target belongs to.
    drug_id = Field()
    # Gene name of the target.
    gene_name = Field()
    # When the record was scraped.
    scraped_at = Field()
|
||
|
||
class ActionItem(Item):
    """Scraped action of a target; fields mirror the ``Action`` model columns."""

    # Identifier of the target the action belongs to.
    target_id = Field()
    # Name of the action.
    name = Field()
    # When the record was scraped.
    scraped_at = Field()
|
||
|
||
class ExternalIdentifierItem(Item):
    """Scraped alternative identifier of a target.

    Fields mirror the columns of the ``ExternalIdentifier`` model.
    """

    # Identifier of the target the external identifier belongs to.
    target_id = Field()
    # Name of the identifier (presumably the external resource -- confirm).
    name = Field()
    # The identifier value itself.
    value = Field()
    # URL associated with the identifier.
    url = Field()
    # When the record was scraped.
    scraped_at = Field()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
"""Sqlalchemy models.""" | ||
import os | ||
|
||
from dotenv import load_dotenv | ||
from eralchemy import render_er | ||
from sqlalchemy import ( | ||
DDL, | ||
Column, | ||
DateTime, | ||
ForeignKeyConstraint, | ||
PrimaryKeyConstraint, | ||
String, | ||
create_engine, | ||
event, | ||
) | ||
from sqlalchemy.ext.declarative import declarative_base | ||
|
||
load_dotenv(override=True) | ||
Base = declarative_base() | ||
|
||
SCHEMA = "drugbank" | ||
|
||
|
||
def db_connect():
    """Create a database connection and return the SQLAlchemy engine.

    The connection URL is read from the ``CONNECTION_STRING`` environment
    variable.

    Returns:
        The engine produced by ``create_engine`` for that URL.

    Raises:
        RuntimeError: If ``CONNECTION_STRING`` is unset or empty.
    """
    connection_string = os.environ.get("CONNECTION_STRING")
    if not connection_string:
        # Fail early with an actionable message instead of handing ``None``
        # to create_engine, which produces an obscure error.
        raise RuntimeError(
            "CONNECTION_STRING environment variable is not set; "
            "create a .env file from .env.template and fill it in."
        )
    return create_engine(connection_string)
|
||
|
||
def create_table(engine):
    """Create the schema and all model tables, then render the ER diagram.

    A ``before_create`` listener on the metadata issues
    ``CREATE SCHEMA IF NOT EXISTS`` ahead of table creation. Afterwards the
    ER diagram is written to ``static/drugbank_schema.png``.

    Args:
        engine: SQLAlchemy engine used to create the tables.
    """
    create_schema = DDL(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    metadata = Base.metadata
    event.listen(metadata, "before_create", create_schema)
    metadata.create_all(engine)

    # The diagram is rendered from the connection string in the environment.
    connection_string = os.environ.get("CONNECTION_STRING")
    render_er(connection_string, "static/drugbank_schema.png")
|
||
|
||
class Drug(Base):
    """Row of the ``drugs`` table: a drug as captured at one scrape time."""

    __tablename__ = "drugs"

    id = Column(String)
    smiles = Column(String)
    scraped_at = Column(DateTime)

    # Composite primary key; the table lives in the SCHEMA schema.
    __table_args__ = (
        PrimaryKeyConstraint("id", "scraped_at"),
        {"schema": SCHEMA},
    )
|
||
|
||
class Target(Base):
    """Row of the ``targets`` table: a drug target at one scrape time."""

    __tablename__ = "targets"

    target_id = Column(String)
    drug_id = Column(String)
    gene_name = Column(String)
    scraped_at = Column(DateTime)

    # The composite foreign key ties (drug_id, scraped_at) to a Drug row, so a
    # target must carry the same scraped_at value as the drug it references.
    __table_args__ = (
        PrimaryKeyConstraint("target_id", "scraped_at"),
        ForeignKeyConstraint([drug_id, scraped_at], [Drug.id, Drug.scraped_at]),
        {"schema": SCHEMA},
    )
|
||
|
||
class Action(Base):
    """Row of the ``actions`` table: one action of a target at a scrape time."""

    __tablename__ = "actions"

    target_id = Column(String)
    name = Column(String)
    scraped_at = Column(DateTime)

    # (target_id, scraped_at) must match an existing Target row.
    __table_args__ = (
        PrimaryKeyConstraint("target_id", "name", "scraped_at"),
        ForeignKeyConstraint(
            [target_id, scraped_at],
            [Target.target_id, Target.scraped_at],
        ),
        {"schema": SCHEMA},
    )
|
||
|
||
class ExternalIdentifier(Base):
    """Row of the ``external_identifiers`` table for a target."""

    __tablename__ = "external_identifiers"

    target_id = Column(String)
    name = Column(String)
    value = Column(String)
    url = Column(String)
    scraped_at = Column(DateTime)

    # The primary key is (target_id, name, scraped_at), so a target can hold
    # one row per identifier name per scrape time.
    # (target_id, scraped_at) must match an existing Target row.
    __table_args__ = (
        PrimaryKeyConstraint("target_id", "name", "scraped_at"),
        ForeignKeyConstraint(
            [target_id, scraped_at],
            [Target.target_id, Target.scraped_at],
        ),
        {"schema": SCHEMA},
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,39 @@ | ||
# Define your item pipelines here | ||
# | ||
# Don't forget to add your pipeline to the ITEM_PIPELINES setting | ||
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html | ||
from sqlalchemy.orm import sessionmaker | ||
|
||
|
||
# useful for handling different item types with a single interface | ||
from itemadapter import ItemAdapter | ||
from drugbank.exceptions import UnknownItemException | ||
from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem | ||
from drugbank.models import ( | ||
Action, | ||
Drug, | ||
ExternalIdentifier, | ||
Target, | ||
create_table, | ||
db_connect, | ||
) | ||
|
||
|
||
class DrugbankPipeline:
    """Item pipeline that stores every scraped item in the database.

    Each supported Scrapy item is converted to its SQLAlchemy model and
    committed on its own.
    """

    def __init__(self):
        """Connect to the database, create the tables and open a session."""
        engine = db_connect()
        create_table(engine)
        self.Session = sessionmaker(bind=engine)
        self.session = self.Session()

    def process_item(self, item, spider):
        """Persist ``item`` and pass it on to the next pipeline stage.

        Args:
            item: A ``DrugItem``, ``ActionItem``, ``TargetItem`` or
                ``ExternalIdentifierItem``.
            spider: The spider that produced the item (unused).

        Returns:
            The item, unchanged.

        Raises:
            UnknownItemException: If ``item`` is none of the supported types.
            Exception: Any error raised while adding or committing the row
                is re-raised after the session has been rolled back.
        """
        # Checked in order; the first matching item type selects the model.
        model_by_item = (
            (DrugItem, Drug),
            (ActionItem, Action),
            (TargetItem, Target),
            (ExternalIdentifierItem, ExternalIdentifier),
        )
        for item_cls, model_cls in model_by_item:
            if isinstance(item, item_cls):
                db_item = model_cls(**item)
                break
        else:
            raise UnknownItemException(
                f"Unsupported item type: {type(item).__name__}"
            )

        try:
            self.session.add(db_item)
            self.session.commit()
        except Exception:
            # After a failed flush the session refuses further work until it
            # is rolled back; without this, one bad item breaks all later ones.
            self.session.rollback()
            raise
        finally:
            # Release the connection whether or not the commit succeeded.
            self.session.close()

        return item
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.