diff --git a/.env.template b/.env.template
new file mode 100644
index 0000000..85077bb
--- /dev/null
+++ b/.env.template
@@ -0,0 +1 @@
+CONNECTION_STRING=postgresql://:@/
diff --git a/.gitignore b/.gitignore
index b561863..ff8909b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -137,4 +137,6 @@ dmypy.json
 
 # Cython debug symbols
 cython_debug/
-.DS_Store
\ No newline at end of file
+.DS_Store
+
+.env
diff --git a/.isort.cfg b/.isort.cfg
new file mode 100644
index 0000000..f985bb0
--- /dev/null
+++ b/.isort.cfg
@@ -0,0 +1,4 @@
+[settings]
+multi_line_output=3
+include_trailing_comma=True
+line_length=88
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..34120f1
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,17 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v2.4.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: requirements-txt-fixer
+- repo: https://github.com/psf/black
+  rev: 20.8b1
+  hooks:
+  - id: black
+- repo: https://github.com/timothycrosley/isort
+  rev: 5.5.0
+  hooks:
+  - id: isort
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..30eb059
--- /dev/null
+++ b/README.md
@@ -0,0 +1,33 @@
+# Drugbank Scraper
+
+## Run
+
+This repo uses `Python 3.8.5`.
+
+1. See the [Pipenv & Virtual Environments](https://docs.python-guide.org/dev/virtualenvs/) guide to create and activate a virtual environment.
+1. Install the requirements with `pip install -r requirements.txt`.
+1. Create a PostgreSQL database.
+1. Create a `.env` file with `cp .env.template .env` and fill in the environment variables.
+
+## Spiders
+
+### Drug
+
+Run `scrapy crawl drug` to run the drug spider. It creates the database tables, scrapes the data, and loads the results into the `drugbank` schema. Concretely, it will:
+
+1. Scrape the following data:
+    * DrugBank ID
+    * SMILES string
+    * Gene name
+    * Actions and external identifiers of every target.
+2. Save the scraped data into the previously created PostgreSQL database.
+
+
+![Drugbank Schema](static/drugbank_schema.png "Drugbank Schema")
+
+
+## Development
+
+1. See the virtual environment step above.
+1. Install the development requirements with `pip install -r requirements-dev.txt`.
+1. Run `pre-commit install` to install the pre-commit hooks. This repo uses pre-commit hooks for code quality; the configuration is in [.pre-commit-config.yaml](.pre-commit-config.yaml), and more information is available on the [pre-commit website](https://pre-commit.com/).
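Before the first crawl, it is worth checking that CONNECTION_STRING in .env actually resolves. A minimal smoke test, assuming the filled-in .env from the template above and a running PostgreSQL server (the query is only illustrative):

    import os

    from dotenv import load_dotenv
    from sqlalchemy import create_engine

    load_dotenv(override=True)
    engine = create_engine(os.environ["CONNECTION_STRING"])  # KeyError if the variable is unset
    with engine.connect() as conn:
        print(conn.execute("SELECT version()").scalar())  # fails fast on a bad DSN
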
diff --git a/drugbank/exceptions.py b/drugbank/exceptions.py
new file mode 100644
index 0000000..5af7200
--- /dev/null
+++ b/drugbank/exceptions.py
@@ -0,0 +1,9 @@
+"""Scraper exceptions."""
+
+
+class ItemException(Exception):
+    pass
+
+
+class UnknownItemException(ItemException):
+    pass
diff --git a/drugbank/items.py b/drugbank/items.py
index 392df28..3b26b38 100644
--- a/drugbank/items.py
+++ b/drugbank/items.py
@@ -1,12 +1,29 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
+"""Models for scraped items."""
+from scrapy.item import Field, Item
 
-import scrapy
 
+class DrugItem(Item):
+    id = Field()
+    smiles = Field()
+    scraped_at = Field()
 
-class DrugbankItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+
+class TargetItem(Item):
+    target_id = Field()
+    drug_id = Field()
+    gene_name = Field()
+    scraped_at = Field()
+
+
+class ActionItem(Item):
+    target_id = Field()
+    name = Field()
+    scraped_at = Field()
+
+
+class ExternalIdentifierItem(Item):
+    target_id = Field()
+    name = Field()
+    value = Field()
+    url = Field()
+    scraped_at = Field()
diff --git a/drugbank/models.py b/drugbank/models.py
new file mode 100644
index 0000000..c0078cd
--- /dev/null
+++ b/drugbank/models.py
@@ -0,0 +1,96 @@
+"""SQLAlchemy models."""
+import os
+
+from dotenv import load_dotenv
+from eralchemy import render_er
+from sqlalchemy import (
+    DDL,
+    Column,
+    DateTime,
+    ForeignKeyConstraint,
+    PrimaryKeyConstraint,
+    String,
+    create_engine,
+    event,
+)
+from sqlalchemy.ext.declarative import declarative_base
+
+load_dotenv(override=True)
+Base = declarative_base()
+
+SCHEMA = "drugbank"
+
+
+def db_connect():
+    """Create database connection and return SQLAlchemy engine."""
+    return create_engine(os.environ["CONNECTION_STRING"])
+
+
+def create_table(engine):
+    event.listen(
+        Base.metadata, "before_create", DDL(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
+    )
+    Base.metadata.create_all(engine)
+
+    render_er(os.environ["CONNECTION_STRING"], "static/drugbank_schema.png")
+
+
+class Drug(Base):
+    __tablename__ = "drugs"
+    __table_args__ = (
+        PrimaryKeyConstraint("id", "scraped_at"),
+        {"schema": SCHEMA},
+    )
+
+    id = Column(String)
+    smiles = Column(String)
+    scraped_at = Column(DateTime)
+
+
+class Target(Base):
+    __tablename__ = "targets"
+
+    target_id = Column(String)
+    drug_id = Column(String)
+    gene_name = Column(String)
+    scraped_at = Column(DateTime)
+
+    __table_args__ = (
+        PrimaryKeyConstraint("target_id", "scraped_at"),
+        ForeignKeyConstraint((drug_id, scraped_at), [Drug.id, Drug.scraped_at]),
+        {"schema": SCHEMA},
+    )
+
+
+class Action(Base):
+    __tablename__ = "actions"
+
+    target_id = Column(String)
+    name = Column(String)
+    scraped_at = Column(DateTime)
+
+    __table_args__ = (
+        PrimaryKeyConstraint("target_id", "name", "scraped_at"),
+        ForeignKeyConstraint(
+            (target_id, scraped_at), [Target.target_id, Target.scraped_at]
+        ),
+        {"schema": SCHEMA},
+    )
+
+
+class ExternalIdentifier(Base):
+    __tablename__ = "external_identifiers"
+
+    target_id = Column(String)
+    name = Column(String)
+    value = Column(String)
+    url = Column(String)
+    scraped_at = Column(DateTime)
+
+    __table_args__ = (
+        PrimaryKeyConstraint("target_id", "name", "scraped_at"),
+        ForeignKeyConstraint(
+            (target_id, scraped_at), [Target.target_id, Target.scraped_at]
+        ),
+        {"schema": SCHEMA},
+    )
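A short usage sketch for these models (illustrative, not part of the diff): every table carries scraped_at in its composite primary key, so joins have to match both the id and the timestamp. Assuming a database already populated by the pipeline:

    from sqlalchemy.orm import sessionmaker

    from drugbank.models import Drug, Target, db_connect

    session = sessionmaker(bind=db_connect())()
    rows = (
        session.query(Drug.id, Target.gene_name)
        # Composite join: the drug id AND the shared scrape timestamp.
        .join(Target, (Target.drug_id == Drug.id) & (Target.scraped_at == Drug.scraped_at))
        .all()
    )
    for drug_id, gene_name in rows:
        print(drug_id, gene_name)
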
diff --git a/drugbank/pipelines.py b/drugbank/pipelines.py
index 0298c1d..7b6c84b 100644
--- a/drugbank/pipelines.py
+++ b/drugbank/pipelines.py
@@ -1,13 +1,43 @@
-# Define your item pipelines here
-#
-# Don't forget to add your pipeline to the ITEM_PIPELINES setting
-# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.orm import sessionmaker
 
-
-# useful for handling different item types with a single interface
-from itemadapter import ItemAdapter
+from drugbank.exceptions import UnknownItemException
+from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem
+from drugbank.models import (
+    Action,
+    Drug,
+    ExternalIdentifier,
+    Target,
+    create_table,
+    db_connect,
+)
 
 
 class DrugbankPipeline:
+    def __init__(self):
+        engine = db_connect()
+        create_table(engine)
+        self.Session = sessionmaker(bind=engine)
+        self.session = self.Session()
+
     def process_item(self, item, spider):
+        # Map each item type to its SQLAlchemy model.
+        if isinstance(item, DrugItem):
+            db_item = Drug(**item)
+        elif isinstance(item, ActionItem):
+            db_item = Action(**item)
+        elif isinstance(item, TargetItem):
+            db_item = Target(**item)
+        elif isinstance(item, ExternalIdentifierItem):
+            db_item = ExternalIdentifier(**item)
+        else:
+            raise UnknownItemException
+
+        try:
+            self.session.add(db_item)
+            self.session.commit()
+        except SQLAlchemyError:
+            # Roll back so one bad item does not poison the shared session.
+            self.session.rollback()
+            raise
         return item
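To make the dispatch concrete, a minimal sketch of feeding one item through the pipeline by hand — this assumes a reachable database, since __init__ connects and creates the tables, and the SMILES string here is only a placeholder:

    from datetime import datetime

    from drugbank.items import DrugItem
    from drugbank.pipelines import DrugbankPipeline

    pipeline = DrugbankPipeline()
    item = DrugItem(id="DB00619", smiles="C1=CC=CC=C1", scraped_at=datetime.utcnow())
    pipeline.process_item(item, spider=None)  # maps DrugItem -> Drug, inserts one row
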
diff --git a/drugbank/settings.py b/drugbank/settings.py
index de4e930..b214da5 100644
--- a/drugbank/settings.py
+++ b/drugbank/settings.py
@@ -7,82 +7,82 @@
 #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
 #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
 
-BOT_NAME = 'drugbank'
-
-SPIDER_MODULES = ['drugbank.spiders']
-NEWSPIDER_MODULE = 'drugbank.spiders'
+BOT_NAME = "drugbank"
+LOG_LEVEL = "WARNING"
+SPIDER_MODULES = ["drugbank.spiders"]
+NEWSPIDER_MODULE = "drugbank.spiders"
 
 
 # Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = 'drugbank (+http://www.yourdomain.com)'
+# USER_AGENT = 'drugbank (+http://www.yourdomain.com)'
 
 # Obey robots.txt rules
-ROBOTSTXT_OBEY = True
+ROBOTSTXT_OBEY = False
 
 # Configure maximum concurrent requests performed by Scrapy (default: 16)
-#CONCURRENT_REQUESTS = 32
+# CONCURRENT_REQUESTS = 32
 
 # Configure a delay for requests for the same website (default: 0)
 # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
 # See also autothrottle settings and docs
-#DOWNLOAD_DELAY = 3
+# DOWNLOAD_DELAY = 3
 # The download delay setting will honor only one of:
-#CONCURRENT_REQUESTS_PER_DOMAIN = 16
-#CONCURRENT_REQUESTS_PER_IP = 16
+# CONCURRENT_REQUESTS_PER_DOMAIN = 16
+# CONCURRENT_REQUESTS_PER_IP = 16
 
 # Disable cookies (enabled by default)
-#COOKIES_ENABLED = False
+# COOKIES_ENABLED = False
 
 # Disable Telnet Console (enabled by default)
-#TELNETCONSOLE_ENABLED = False
+# TELNETCONSOLE_ENABLED = False
 
 # Override the default request headers:
-#DEFAULT_REQUEST_HEADERS = {
+# DEFAULT_REQUEST_HEADERS = {
 #   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
 #   'Accept-Language': 'en',
-#}
+# }
 
 # Enable or disable spider middlewares
 # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
-#SPIDER_MIDDLEWARES = {
+# SPIDER_MIDDLEWARES = {
 #    'drugbank.middlewares.DrugbankSpiderMiddleware': 543,
-#}
+# }
 
 # Enable or disable downloader middlewares
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
-#DOWNLOADER_MIDDLEWARES = {
+# DOWNLOADER_MIDDLEWARES = {
 #    'drugbank.middlewares.DrugbankDownloaderMiddleware': 543,
-#}
+# }
 
 # Enable or disable extensions
 # See https://docs.scrapy.org/en/latest/topics/extensions.html
-#EXTENSIONS = {
+# EXTENSIONS = {
 #    'scrapy.extensions.telnet.TelnetConsole': None,
-#}
+# }
 
 # Configure item pipelines
 # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
-#ITEM_PIPELINES = {
-#    'drugbank.pipelines.DrugbankPipeline': 300,
-#}
+ITEM_PIPELINES = {
+    "drugbank.pipelines.DrugbankPipeline": 300,
+}
 
 # Enable and configure the AutoThrottle extension (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
-#AUTOTHROTTLE_ENABLED = True
+# AUTOTHROTTLE_ENABLED = True
 # The initial download delay
-#AUTOTHROTTLE_START_DELAY = 5
+# AUTOTHROTTLE_START_DELAY = 5
 # The maximum download delay to be set in case of high latencies
-#AUTOTHROTTLE_MAX_DELAY = 60
+# AUTOTHROTTLE_MAX_DELAY = 60
 # The average number of requests Scrapy should be sending in parallel to
 # each remote server
-#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
 # Enable showing throttling stats for every response received:
-#AUTOTHROTTLE_DEBUG = False
+# AUTOTHROTTLE_DEBUG = False
 
 # Enable and configure HTTP caching (disabled by default)
 # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = 'httpcache'
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
+# HTTPCACHE_ENABLED = True
+# HTTPCACHE_EXPIRATION_SECS = 0
+# HTTPCACHE_DIR = 'httpcache'
+# HTTPCACHE_IGNORE_HTTP_CODES = []
+# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
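With ITEM_PIPELINES now enabled, a quick check (illustrative, run from the project root) confirms that Scrapy sees the pipeline with priority 300:

    from scrapy.utils.project import get_project_settings

    settings = get_project_settings()
    print(settings.getdict("ITEM_PIPELINES"))
    # expected: {'drugbank.pipelines.DrugbankPipeline': 300}
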
diff --git a/drugbank/spiders/drug.py b/drugbank/spiders/drug.py
new file mode 100644
index 0000000..ea052cc
--- /dev/null
+++ b/drugbank/spiders/drug.py
@@ -0,0 +1,101 @@
+from datetime import datetime
+
+import scrapy
+
+from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem
+
+# One timestamp per crawl so every row from the same run shares a key.
+SCRAPED_AT = datetime.utcnow()
+
+
+class DrugSpider(scrapy.Spider):
+    name = "drug"
+    allowed_domains = ["drugbank.ca"]
+    start_urls = [
+        "https://www.drugbank.ca/drugs/DB00619",
+        "https://www.drugbank.ca/drugs/DB01048",
+        "https://www.drugbank.ca/drugs/DB14093",
+        "https://www.drugbank.ca/drugs/DB00173",
+        "https://www.drugbank.ca/drugs/DB00734",
+        "https://www.drugbank.ca/drugs/DB00218",
+        "https://www.drugbank.ca/drugs/DB05196",
+        "https://www.drugbank.ca/drugs/DB09095",
+        "https://www.drugbank.ca/drugs/DB01053",
+        "https://www.drugbank.ca/drugs/DB00274",
+    ]
+
+    def parse(self, response):
+        drug = DrugItem()
+
+        drugbank_id = response.url.split("/")[-1]
+        drug["id"] = drugbank_id
+
+        # SMILES
+        drug["smiles"] = response.xpath(
+            '//dt[@id="smiles"]/following-sibling::dd/div/text()'
+        ).extract_first()
+        drug["scraped_at"] = SCRAPED_AT
+
+        yield drug
+
+        # Targets
+        targets = response.xpath(
+            '//*[@id="targets"]//div[contains(@class, "bond card")]'
+        )
+
+        for target in targets:
+            target_item = TargetItem()
+
+            target_id = target.xpath("@id").extract_first()
+            target_item["target_id"] = target_id
+            target_item["drug_id"] = drugbank_id
+
+            # Gene name
+            if gene_name_labels := target.xpath('.//dt[@id="gene-name"]'):
+                gene_name_value = (
+                    gene_name_labels[0]
+                    .xpath("following-sibling::dd/text()")
+                    .extract_first()
+                )
+                target_item["gene_name"] = gene_name_value
+
+            target_item["scraped_at"] = SCRAPED_AT
+            yield target_item
+
+            # Actions
+            if action_labels := target.xpath('.//dt[@id="actions"]'):
+                actions = action_labels[0].xpath("following-sibling::dd/div")
+
+                for action in actions:
+                    _action = ActionItem(target_id=target_id)
+                    _action["name"] = action.xpath("text()").extract_first()
+                    _action["scraped_at"] = SCRAPED_AT
+                    yield _action
+
+            if target_details_link := target.xpath(
+                './/div[@class="card-header"]/a/@href'
+            ).get():
+                yield scrapy.Request(
+                    response.urljoin(target_details_link),
+                    callback=self.process_target_details,
+                    meta={"target_item": target_item, "drug": drug},
+                )
+
+    def process_target_details(self, response):
+        target_item = response.meta["target_item"]
+        external_identifiers = response.xpath(
+            '//*[@id="external-identifiers"]/tbody//tr'
+        )
+
+        for identifier in external_identifiers:
+            resource, link = identifier.xpath(".//td")
+
+            item = ExternalIdentifierItem()
+            item["target_id"] = target_item["target_id"]
+            item["name"] = resource.xpath("text()").get()
+            item["value"] = link.xpath("a/text()").get()
+            item["url"] = link.xpath("a/@href").get()
+
+            item["scraped_at"] = SCRAPED_AT
+
+            yield item
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000..f82b7f6
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,3 @@
+-r requirements.txt
+
+pre-commit==2.7.1
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..1381984
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+ERAlchemy==1.2.10
+psycopg2==2.8.5
+python-dotenv==0.14.0
+Scrapy==2.3.0
+SQLAlchemy==1.3.19
diff --git a/static/drugbank_schema.png b/static/drugbank_schema.png
new file mode 100644
index 0000000..421cff0
Binary files /dev/null and b/static/drugbank_schema.png differ
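The SMILES extraction in the spider above can be exercised offline; this toy markup mimics the relevant DrugBank page structure, and the value shown is aspirin's SMILES, purely as an example:

    from scrapy.selector import Selector

    html = '<dl><dt id="smiles">SMILES</dt><dd><div>CC(=O)OC1=CC=CC=C1C(O)=O</div></dd></dl>'
    sel = Selector(text=html)
    print(sel.xpath('//dt[@id="smiles"]/following-sibling::dd/div/text()').get())
    # -> CC(=O)OC1=CC=CC=C1C(O)=O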