Merge branch 'release/1.0.0'
aliavni committed Sep 6, 2020
2 parents e28d46a + 6ec9b95 commit 5cf0fe2
Showing 14 changed files with 366 additions and 50 deletions.
1 change: 1 addition & 0 deletions .env.template
@@ -0,0 +1 @@
CONNECTION_STRING=postgresql://<USERNAME>:<PASSWORD>@<HOST>/<DATABASE>
4 changes: 3 additions & 1 deletion .gitignore
@@ -137,4 +137,6 @@ dmypy.json
# Cython debug symbols
cython_debug/

.DS_Store
.DS_Store

.env
4 changes: 4 additions & 0 deletions .isort.cfg
@@ -0,0 +1,4 @@
[settings]
multi_line_output=3
include_trailing_comma=True
line_length=88
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,17 @@
# See https://pre-commit.com for more information
# See https://pre-commit.com/hooks.html for more hooks
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v2.4.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: requirements-txt-fixer
- repo: https://github.com/psf/black
rev: 20.8b1
hooks:
- id: black
- repo: https://github.com/timothycrosley/isort
rev: 5.5.0
hooks:
- id: isort
33 changes: 33 additions & 0 deletions README.md
@@ -0,0 +1,33 @@
# Drugbank Scraper

## Run

This repo uses `Python 3.8.5`.

1. See the [Pipenv & Virtual Environments](https://docs.python-guide.org/dev/virtualenvs/) guide to create and activate a virtual environment.
1. Install requirements with `pip install -r requirements.txt`.
1. Create a PostgreSQL database.
1. Create a `.env` file with `cp .env.template .env` and fill in the environment variables.

## Spiders

### Drug

Run `scrapy crawl drug` to run the drug spider. The spider scrapes the data, creates the database tables, and populates them; the final data lands in the `drugbank` schema. Concretely, it will:

1. Scrape the following data:
* DrugBank ID
* SMILES string
* Gene name
* Actions and external identifiers of every target.
2. Save the scraped data into the previously created PostgreSQL database.


![Drugbank Schema](static/drugbank_schema.png "Drugbank Schema")
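
Once the crawl has finished, the populated tables can be inspected with the same connection string. A minimal sketch (assuming the `.env` file from the setup steps above; the row counts are just an example of what to look at):

```python
import os

from dotenv import load_dotenv
from sqlalchemy import create_engine, text

load_dotenv(override=True)  # pulls CONNECTION_STRING from .env
engine = create_engine(os.environ["CONNECTION_STRING"])

with engine.connect() as conn:
    # count the scraped rows in the drugbank schema
    drugs = conn.execute(text("SELECT count(*) FROM drugbank.drugs")).scalar()
    targets = conn.execute(text("SELECT count(*) FROM drugbank.targets")).scalar()
    print(f"{drugs} drugs, {targets} targets")
```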


## Development

1. See the virtual environment step above.
1. Install requirements with `pip install -r requirements-dev.txt`.
1. Run `pre-commit install` to install the pre-commit hooks. This repo is already set up with several hooks for code-quality checks; the configuration file is available [here](.pre-commit-config.yaml), and more information about pre-commit is available on [their website](https://pre-commit.com/).
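
After installation the hooks run automatically on every `git commit`; to check the whole tree at once, `pre-commit run --all-files` runs every hook against all files.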
9 changes: 9 additions & 0 deletions drugbank/exceptions.py
@@ -0,0 +1,9 @@
"""Scraper exceptions."""


class ItemException(Exception):
pass


class UnknownItemException(ItemException):
pass
35 changes: 26 additions & 9 deletions drugbank/items.py
@@ -1,12 +1,29 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
"""Models for scraped items."""
from scrapy.item import Field, Item

import scrapy

class DrugItem(Item):
id = Field()
smiles = Field()
scraped_at = Field()

class DrugbankItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
pass

class TargetItem(Item):
target_id = Field()
drug_id = Field()
gene_name = Field()
scraped_at = Field()


class ActionItem(Item):
target_id = Field()
name = Field()
scraped_at = Field()


class ExternalIdentifierItem(Item):
target_id = Field()
name = Field()
value = Field()
url = Field()
scraped_at = Field()
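
For context, these items are what the spider yields and what the pipeline later persists. A minimal sketch of a spider callback emitting a `DrugItem` (illustrative only; the real spider lives under `drugbank/spiders/` and is not part of this diff, and the URL and XPath below are assumptions):

```python
from datetime import datetime

import scrapy

from drugbank.items import DrugItem


class DrugSpiderSketch(scrapy.Spider):
    """Illustrative sketch; the real spider in drugbank/spiders/ differs."""

    name = "drug_sketch"
    start_urls = ["https://go.drugbank.com/drugs/DB00001"]  # example drug page

    def parse(self, response):
        # field names mirror DrugItem; the XPath is a placeholder, not the repo's
        smiles = response.xpath(
            "//dt[@id='smiles']/following-sibling::dd[1]//text()"
        ).get()
        yield DrugItem(
            id=response.url.rstrip("/").split("/")[-1],
            smiles=smiles,
            scraped_at=datetime.utcnow(),
        )
```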
96 changes: 96 additions & 0 deletions drugbank/models.py
@@ -0,0 +1,96 @@
"""Sqlalchemy models."""
import os

from dotenv import load_dotenv
from eralchemy import render_er
from sqlalchemy import (
DDL,
Column,
DateTime,
ForeignKeyConstraint,
PrimaryKeyConstraint,
String,
create_engine,
event,
)
from sqlalchemy.ext.declarative import declarative_base

load_dotenv(override=True)
Base = declarative_base()

SCHEMA = "drugbank"


def db_connect():
"""Create database connection and return sqlalchemy engine."""
return create_engine(os.environ.get("CONNECTION_STRING"))


def create_table(engine):
event.listen(
Base.metadata, "before_create", DDL(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
)
Base.metadata.create_all(engine)

render_er(os.environ.get("CONNECTION_STRING"), "static/drugbank_schema.png")


class Drug(Base):
__tablename__ = "drugs"
__table_args__ = (
PrimaryKeyConstraint("id", "scraped_at"),
{"schema": SCHEMA},
)

id = Column(String)
smiles = Column(String)
scraped_at = Column(DateTime)


class Target(Base):
__tablename__ = "targets"

target_id = Column(String)
drug_id = Column(String)
gene_name = Column(String)
scraped_at = Column(DateTime)

__table_args__ = (
PrimaryKeyConstraint("target_id", "scraped_at"),
ForeignKeyConstraint((drug_id, scraped_at), [Drug.id, Drug.scraped_at]),
{"schema": SCHEMA},
)


class Action(Base):
__tablename__ = "actions"

target_id = Column(String)
name = Column(String)
scraped_at = Column(DateTime)

__table_args__ = (
PrimaryKeyConstraint("target_id", "name", "scraped_at"),
ForeignKeyConstraint(
(target_id, scraped_at), [Target.target_id, Target.scraped_at]
),
{"schema": SCHEMA},
)


class ExternalIdentifier(Base):
__tablename__ = "external_identifiers"

target_id = Column(String)
name = Column(String)
value = Column(String)
url = Column(String)
scraped_at = Column(DateTime)

__table_args__ = (
PrimaryKeyConstraint("target_id", "name", "scraped_at"),
ForeignKeyConstraint(
(target_id, scraped_at), [Target.target_id, Target.scraped_at]
),
{"schema": SCHEMA},
)
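
A small usage sketch for reading the tables back, assuming the schema has already been created and populated; the join follows the composite foreign key `(drug_id, scraped_at)` -> `(drugs.id, drugs.scraped_at)` defined above:

```python
from sqlalchemy.orm import sessionmaker

from drugbank.models import Drug, Target, db_connect

engine = db_connect()
Session = sessionmaker(bind=engine)
session = Session()

# drugs joined to their targets on the composite key (drug_id, scraped_at)
rows = (
    session.query(Drug.id, Drug.smiles, Target.gene_name)
    .join(Target, (Target.drug_id == Drug.id) & (Target.scraped_at == Drug.scraped_at))
    .limit(10)
    .all()
)
for drug_id, smiles, gene_name in rows:
    print(drug_id, gene_name)

session.close()
```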
40 changes: 33 additions & 7 deletions drugbank/pipelines.py
@@ -1,13 +1,39 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
from sqlalchemy.orm import sessionmaker


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
from drugbank.exceptions import UnknownItemException
from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem
from drugbank.models import (
Action,
Drug,
ExternalIdentifier,
Target,
create_table,
db_connect,
)


class DrugbankPipeline:
def __init__(self):
engine = db_connect()
create_table(engine)
self.Session = sessionmaker(bind=engine)
self.session = self.Session()

def process_item(self, item, spider):

if isinstance(item, DrugItem):
db_item = Drug(**item)
elif isinstance(item, ActionItem):
db_item = Action(**item)
elif isinstance(item, TargetItem):
db_item = Target(**item)
elif isinstance(item, ExternalIdentifierItem):
db_item = ExternalIdentifier(**item)
else:
raise UnknownItemException

self.session.add(db_item)
self.session.commit()
self.session.close()

return item
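
The `isinstance` chain above maps each scraped item class to its SQLAlchemy model. An equivalent dict-based dispatch is shown below purely as a sketch of an alternative design, not what the repository uses:

```python
# Sketch only: the repo's pipeline uses the isinstance chain above.
from drugbank.exceptions import UnknownItemException
from drugbank.items import ActionItem, DrugItem, ExternalIdentifierItem, TargetItem
from drugbank.models import Action, Drug, ExternalIdentifier, Target

ITEM_TO_MODEL = {
    DrugItem: Drug,
    ActionItem: Action,
    TargetItem: Target,
    ExternalIdentifierItem: ExternalIdentifier,
}


def to_db_item(item):
    """Map a scraped item to its model instance, mirroring process_item."""
    try:
        model = ITEM_TO_MODEL[type(item)]
    except KeyError:
        raise UnknownItemException(type(item).__name__)
    return model(**item)
```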
66 changes: 33 additions & 33 deletions drugbank/settings.py
@@ -7,82 +7,82 @@
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'drugbank'

SPIDER_MODULES = ['drugbank.spiders']
NEWSPIDER_MODULE = 'drugbank.spiders'
BOT_NAME = "drugbank"
LOG_LEVEL = "WARNING"
SPIDER_MODULES = ["drugbank.spiders"]
NEWSPIDER_MODULE = "drugbank.spiders"


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'drugbank (+http://www.yourdomain.com)'
# USER_AGENT = 'drugbank (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# CONCURRENT_REQUESTS_PER_DOMAIN = 16
# CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# }

# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# SPIDER_MIDDLEWARES = {
# 'drugbank.middlewares.DrugbankSpiderMiddleware': 543,
#}
# }

# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# DOWNLOADER_MIDDLEWARES = {
# 'drugbank.middlewares.DrugbankDownloaderMiddleware': 543,
#}
# }

# Enable or disable extensions
# See https://docs.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# }

# Configure item pipelines
# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'drugbank.pipelines.DrugbankPipeline': 300,
#}
ITEM_PIPELINES = {
"drugbank.pipelines.DrugbankPipeline": 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# HTTPCACHE_ENABLED = True
# HTTPCACHE_EXPIRATION_SECS = 0
# HTTPCACHE_DIR = 'httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES = []
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'