Merge branch 'release/1.0.0'
aliavni committed Sep 6, 2020
2 parents e28d46a + 6ec9b95 commit 5cf0fe2
Showing 14 changed files with 366 additions and 50 deletions.
1 change: 1 addition & 0 deletions .env.template
@@ -0,0 +1 @@
CONNECTION_STRING=postgresql://<USERNAME>:<PASSWORD>@<HOST>/<DATABASE>
4 changes: 3 additions & 1 deletion .gitignore
@@ -137,4 +137,6 @@ dmypy.json
# Cython debug symbols
cython_debug/

.DS_Store
.DS_Store

.env
4 changes: 4 additions & 0 deletions .isort.cfg
@@ -0,0 +1,4 @@
[settings]
multi_line_output=3
include_trailing_comma=True
line_length=88
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,17 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: requirements-txt-fixer
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
- id: black
- repo: https://github.com/timothycrosley/isort
rev: 5.5.0
hooks:
- id: isort
33 changes: 33 additions & 0 deletions README.md
@@ -0,0 +1,33 @@
# Drugbank Scraper

## Run

This repo uses `Python 3.8.5`.

1. See the [Pipenv & Virtual Environments](https://docs.python-guide.org/dev/virtualenvs/) guide to create and activate a virtual environment.
1. Install requirements with `pip install -r requirements.txt`.
1. Create a PostgreSQL database.
1. Create a `.env` file with `cp .env.template .env` and fill in the environment variables.

## Spiders

### Drug

Run `scrapy crawl drug` to run the drug spider. The spider scrapes the data, creates the database tables, and populates them; the final data lands in the `drugbank` schema. Concretely, it will:

1. Scrape the following data:
* DrugBank ID
* SMILES string
* Gene name
* Actions and external identifiers of every target.
2. Save the scraped data into the previously created PostgreSQL database.


![Drugbank Schema](static/drugbank_schema.png "Drugbank Schema")
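
Once the crawl has finished, the populated tables can be inspected with the same connection string. A minimal sketch (assuming the `.env` file from the setup steps above; the row counts are just an example of what to look at):

```python
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv(override=True)  # pulls CONNECTION_STRING from .env
engine = create_engine(os.environ["CONNECTION_STRING"])

with engine.connect() as conn:
    # count the scraped rows in the drugbank schema
    drugs = conn.execute(text("SELECT count(*) FROM drugbank.drugs")).scalar()
    targets = conn.execute(text("SELECT count(*) FROM drugbank.targets")).scalar()
    print(f"{drugs} drugs, {targets} targets")
```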


## Development

1. See the virtual environment step above.
1. Install requirements with `pip install -r requirements-dev.txt`.
1. Run `pre-commit install` to install the pre-commit hooks. This repo is already set up with several hooks for code-quality checks; the configuration file is available [here](.pre-commit-config.yaml), and more information about pre-commit is available on [their website](https://pre-commit.com/).
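
After installation the hooks run automatically on every `git commit`; to check the whole tree at once, `pre-commit run --all-files` runs every hook against all files.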
9 changes: 9 additions & 0 deletions drugbank/exceptions.py
@@ -0,0 +1,9 @@
"""Scraper exceptions."""


class ItemException(Exception):
pass


class UnknownItemException(ItemException):
pass
35 changes: 26 additions & 9 deletions drugbank/items.py
@@ -1,12 +1,29 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
"""Models for scraped items."""
from scrapy.item import Field, Item

import scrapy

class DrugItem(Item):
id = Field()
smiles = Field()
scraped_at = Field()

class DrugbankItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

class TargetItem(Item):
target_id = Field()
drug_id = Field()
gene_name = Field()
scraped_at = Field()


class ActionItem(Item):
target_id = Field()
name = Field()
scraped_at = Field()


class ExternalIdentifierItem(Item):
target_id = Field()
name = Field()
value = Field()
url = Field()
scraped_at = Field()
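
For context, these items are what the spider yields and what the pipeline later persists. A minimal sketch of a spider callback emitting a `DrugItem` (illustrative only; the real spider lives under `drugbank/spiders/` and is not part of this diff, and the URL and XPath below are assumptions):

```python
from datetime import datetime

import scrapy

from drugbank.items import DrugItem


class DrugSpiderSketch(scrapy.Spider):
    """Illustrative sketch; the real spider in drugbank/spiders/ differs."""

    name = "drug_sketch"
    start_urls = ["https://go.drugbank.com/drugs/DB00001"]  # example drug page

    def parse(self, response):
        # field names mirror DrugItem; the XPath is a placeholder, not the repo's
        smiles = response.xpath(
            "//dt[@id='smiles']/following-sibling::dd[1]//text()"
        ).get()
        yield DrugItem(
            id=response.url.rstrip("/").split("/")[-1],
            smiles=smiles,
            scraped_at=datetime.utcnow(),
        )
```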
96 changes: 96 additions & 0 deletions drugbank/models.py
@@ -0,0 +1,96 @@
"""Sqlalchemy models."""
import os

from dotenv import load_dotenv
from eralchemy import render_er
from sqlalchemy import (
DDL,
Column,
DateTime,
ForeignKeyConstraint,
PrimaryKeyConstraint,
String,
create_engine,
event,
)
from sqlalchemy.ext.declarative import declarative_base

load_dotenv(override=True)
Base = declarative_base()

SCHEMA = "drugbank"


def db_connect():
"""Create database connection and return sqlalchemy engine."""
return create_engine(os.environ.get("CONNECTION_STRING"))


def create_table(engine):
event.listen(
Base.metadata, "before_create", DDL(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
)
Base.metadata.create_all(engine)

render_er(os.environ.get("CONNECTION_STRING"), "static/drugbank_schema.png")


class Drug(Base):
__tablename__ = "drugs"
__table_args__ = (
PrimaryKeyConstraint("id", "scraped_at"),
{"schema": SCHEMA},
)

id = Column(String)
smiles = Column(String)
scraped_at = Column(DateTime)


class Target(Base):
__tablename__ = "targets"

target_id = Column(String)
drug_id = Column(String)
gene_name = Column(String)
scraped_at = Column(DateTime)

__table_args__ = (
PrimaryKeyConstraint("target_id", "scraped_at"),
ForeignKeyConstraint((drug_id, scraped_at), [Drug.id, Drug.scraped_at]),
{"schema": SCHEMA},
)


class Action(Base):
__tablename__ = "actions"

target_id = Column(String)
name = Column(String)
scraped_at = Column(DateTime)

__table_args__ = (
PrimaryKeyConstraint("target_id", "name", "scraped_at"),
ForeignKeyConstraint(
(target_id, scraped_at), [Target.target_id, Target.scraped_at]
),
{"schema": SCHEMA},
)


class ExternalIdentifier(Base):
__tablename__ = "external_identifiers"

target_id = Column(String)
name = Column(String)
value = Column(String)
url = Column(String)
scraped_at = Column(DateTime)

__table_args__ = (
PrimaryKeyConstraint("target_id", "name", "scraped_at"),
ForeignKeyConstraint(
(target_id, scraped_at), [Target.target_id, Target.scraped_at]
),
{"schema": SCHEMA},
)
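
A small usage sketch for reading the tables back, assuming the schema has already been created and populated; the join follows the composite foreign key `(drug_id, scraped_at)` -> `(drugs.id, drugs.scraped_at)` defined above:

```python
from sqlalchemy.orm import sessionmaker

from drugbank.models import Drug, Target, db_connect

engine = db_connect()
Session = sessionmaker(bind=engine)
session = Session()

# drugs joined to their targets on the composite key (drug_id, scraped_at)
rows = (
    session.query(Drug.id, Drug.smiles, Target.gene_name)
    .join(Target, (Target.drug_id == Drug.id) & (Target.scraped_at == Drug.scraped_at))
    .limit(10)
    .all()
)
for drug_id, smiles, gene_name in rows:
    print(drug_id, gene_name)

session.close()
```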
40 changes: 33 additions & 7 deletions drugbank/pipelines.py
@@ -1,13 +1,39 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from sqlalchemy.orm import sessionmaker


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from drugbank.exceptions import UnknownItemException
from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem
from drugbank.models import (
Action,
Drug,
ExternalIdentifier,
Target,
create_table,
db_connect,
)


class DrugbankPipeline:
def __init__(self):
engine = db_connect()
create_table(engine)
self.Session = sessionmaker(bind=engine)
self.session = self.Session()

def process_item(self, item, spider):

if isinstance(item, DrugItem):
db_item = Drug(**item)
elif isinstance(item, ActionItem):
db_item = Action(**item)
elif isinstance(item, TargetItem):
db_item = Target(**item)
elif isinstance(item, ExternalIdentifierItem):
db_item = ExternalIdentifier(**item)
else:
raise UnknownItemException

self.session.add(db_item)
self.session.commit()
self.session.close()

return item
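
The `isinstance` chain above maps each scraped item class to its SQLAlchemy model. An equivalent dict-based dispatch is shown below purely as a sketch of an alternative design, not what the repository uses:

```python
# Sketch only: the repo's pipeline uses the isinstance chain above.
from drugbank.exceptions import UnknownItemException
from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem
from drugbank.models import Action, Drug, ExternalIdentifier, Target

ITEM_TO_MODEL = {
    DrugItem: Drug,
    ActionItem: Action,
    TargetItem: Target,
    ExternalIdentifierItem: ExternalIdentifier,
}


def to_db_item(item):
    """Map a scraped item to its model instance, mirroring process_item."""
    try:
        model = ITEM_TO_MODEL[type(item)]
    except KeyError:
        raise UnknownItemException(type(item).__name__)
    return model(**item)
```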
66 changes: 33 additions & 33 deletions drugbank/settings.py
@@ -7,82 +7,82 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'drugbank'

SPIDER_MODULES = ['drugbank.spiders']
NEWSPIDER_MODULE = 'drugbank.spiders'
BOT_NAME = "drugbank"
LOG_LEVEL = "WARNING"
SPIDER_MODULES = ["drugbank.spiders"]
NEWSPIDER_MODULE = "drugbank.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'drugbank (+http://www.yourdomain.com)'
# USER_AGENT = 'drugbank (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'drugbank.middlewares.DrugbankSpiderMiddleware': 543,
#}
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'drugbank.middlewares.DrugbankDownloaderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'drugbank.pipelines.DrugbankPipeline': 300,
#}
ITEM_PIPELINES = {
"drugbank.pipelines.DrugbankPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'