Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

uniref #107

Merged
merged 3 commits into from
Dec 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Adds uniref parser [#107](https://github.com/Koeng101/dnadesign/pull/107)
- Fixes iso-8859-1 error in reading uniref data dumps [#106](https://github.com/Koeng101/dnadesign/pull/106)
- Updates uniprot parser to read IDs [#104](https://github.com/Koeng101/dnadesign/pull/104)
- Fixes RecursiveFragment to not add flanks to the initial input [#102](https://github.com/Koeng101/dnadesign/pull/102)
- Fixes add flank bug, releases new version of python lib [#101](https://github.com/Koeng101/dnadesign/pull/101)
Expand Down
12 changes: 10 additions & 2 deletions lib/bio/bio.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"github.com/koeng101/dnadesign/lib/bio/sam"
"github.com/koeng101/dnadesign/lib/bio/slow5"
"github.com/koeng101/dnadesign/lib/bio/uniprot"
"github.com/koeng101/dnadesign/lib/bio/uniref"
)

// Format is an enum of different parser formats.
Expand Down Expand Up @@ -63,12 +64,12 @@ Lower level interfaces

// DataTypes defines the possible data types returned by every parser. It is a
// type-set constraint: each term of the union is the record type produced by
// one of the format-specific parser packages.
type DataTypes interface {
// NOTE(review): two union lines appear below, one without and one with
// uniref.Entry. This looks like the before/after pair of a diff rather than
// two intended constraint lines — confirm which one the file really contains.
genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry
genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry | uniref.Entry
}

// HeaderTypes defines the possible header types returned by every parser. It
// is a type-set constraint: each term of the union is the header type produced
// by one of the format-specific parser packages.
type HeaderTypes interface {
// NOTE(review): two union lines appear below, one without and one with
// uniref.Header. This looks like the before/after pair of a diff rather than
// two intended constraint lines — confirm which one the file really contains.
genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header
genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header | uniref.Header
}

// ParserInterface is a generic interface that all parsers must support. It is
Expand Down Expand Up @@ -171,6 +172,13 @@ func NewUniprotParser(r io.Reader) *Parser[uniprot.Entry, uniprot.Header] {
return &Parser[uniprot.Entry, uniprot.Header]{ParserInterface: uniprot.NewParser(r)}
}

// NewUnirefParser initiates a new Uniref parser from an io.Reader. No
// maxLineLength is necessary.
//
// If the underlying uniref parser cannot be constructed, the error is returned
// together with a nil *Parser. Wrapping the failed result instead would hand
// the caller a non-nil Parser whose ParserInterface holds a nil pointer, which
// panics on first use.
func NewUnirefParser(r io.Reader) (*Parser[uniref.Entry, uniref.Header], error) {
	parser, err := uniref.NewParser(r)
	if err != nil {
		return nil, err
	}
	return &Parser[uniref.Entry, uniref.Header]{ParserInterface: parser}, nil
}

/******************************************************************************

Parser higher-level functions
Expand Down
39 changes: 39 additions & 0 deletions lib/bio/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,45 @@ func ExampleNewUniprotParser() {
// Output: P0C9F0
}

// ExampleNewUnirefParser parses a one-entry UniRef50 document held in memory
// and prints the ID of the first entry.
func ExampleNewUnirefParser() {
// The following is the first entry of UniRef50 with the sequence truncated.
// The text is wrapped in a strings.Reader, which stands in for a file as an
// io.Reader. You can edit the text here to see how the parser works.
//
// Note: Unlike the uniprot parser, the uniref parser expects that the file is
// properly terminated with </UniRef50>.
uniprotEntryText := strings.NewReader(`<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef50 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
releaseDate="2024-11-27" version="2024_06">
<entry id="UniRef50_UPI002E2621C6" updated="2024-05-29">
<name>Cluster: uncharacterized protein LOC134193701</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Corticium candelabrum"/>
<property type="common taxon ID" value="121492"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI002E2621C6">
<property type="UniRef100 ID" value="UniRef100_UPI002E2621C6"/>
<property type="UniRef90 ID" value="UniRef90_UPI002E2621C6"/>
<property type="protein name" value="uncharacterized protein LOC134193701"/>
<property type="source organism" value="Corticium candelabrum"/>
<property type="NCBI taxonomy" value="121492"/>
<property type="length" value="49499"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
</UniRef50>`)
// Now we load the parser, and get the first entry out. Both errors are
// discarded to keep the example short.
parser, _ := bio.NewUnirefParser(uniprotEntryText)
entry, _ := parser.Next()

fmt.Println(entry.ID)
// Output: UniRef50_UPI002E2621C6
}

func ExampleNewSamParser() {
// The following can be replaced with a any io.Reader. For example,
// `file, err := os.Open(path)` for file would also work.
Expand Down
9 changes: 0 additions & 9 deletions lib/bio/uniprot/uniprot.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@ import (
"io"
"net/http"
"net/url"
"strings"
)

// Decoder decodes XML elements
Expand Down Expand Up @@ -68,14 +67,6 @@ type Parser struct {

// NewParser wraps r in an XML decoder and returns a Parser that reads from it.
func NewParser(r io.Reader) *Parser {
decoder := xml.NewDecoder(r)
// Oddly enough, the uniref datasets use iso-8859-1, not UTF-8. So we need
// to incorporate this decoder charset reader.
// NOTE(review): the change summary for this file lists only deletions, so
// this charset hook may be the removed code rather than current code —
// confirm against the file itself.
decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
if strings.ToLower(charset) == "iso-8859-1" {
return input, nil // Bytes are passed through untranscoded; that is only correct for the ASCII subset of ISO-8859-1, since bytes >= 0x80 are not valid UTF-8
}
return nil, fmt.Errorf("unsupported charset: %s", charset)
}
return &Parser{decoder: decoder}
}

Expand Down
42 changes: 42 additions & 0 deletions lib/bio/uniref/data/uniref90_mini.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef50 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
releaseDate="2024-11-27" version="2024_06">
<entry id="UniRef50_UPI002E2621C6" updated="2024-05-29">
<name>Cluster: uncharacterized protein LOC134193701</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Corticium candelabrum"/>
<property type="common taxon ID" value="121492"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI002E2621C6">
<property type="UniRef100 ID" value="UniRef100_UPI002E2621C6"/>
<property type="UniRef90 ID" value="UniRef90_UPI002E2621C6"/>
<property type="protein name" value="uncharacterized protein LOC134193701"/>
<property type="source organism" value="Corticium candelabrum"/>
<property type="NCBI taxonomy" value="121492"/>
<property type="length" value="49499"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
<entry id="UniRef50_UPI00358F51CD" updated="2024-11-27">
<name>Cluster: LOW QUALITY PROTEIN: titin</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Myxine glutinosa"/>
<property type="common taxon ID" value="7769"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI00358F51CD">
<property type="UniRef100 ID" value="UniRef100_UPI00358F51CD"/>
<property type="UniRef90 ID" value="UniRef90_UPI00358F51CD"/>
<property type="protein name" value="LOW QUALITY PROTEIN: titin"/>
<property type="source organism" value="Myxine glutinosa"/>
<property type="NCBI taxonomy" value="7769"/>
<property type="length" value="47063"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="47063" checksum="48729625616C010E">MSEQ</sequence>
</representativeMember>
</entry>
</UniRef50>
30 changes: 30 additions & 0 deletions lib/bio/uniref/example_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package uniref_test

import (
"fmt"
"os"
"path/filepath"

"github.com/koeng101/dnadesign/lib/bio/uniref"
)

// Example reads the bundled two-entry fixture and prints a few fields of its
// first entry.
func Example() {
	// The fixture is a plain XML file shipped next to the tests. Errors are
	// discarded throughout to keep the example short.
	fixture := filepath.Join("data", "uniref90_mini.xml")
	xmlFile, _ := os.Open(fixture)
	defer xmlFile.Close()

	// Build a parser on top of the file and pull out only the first entry.
	p, _ := uniref.NewParser(xmlFile)
	first, _ := p.Next()

	fmt.Printf(
		"Entry ID: %s\nName: %s\nSequence Length: %d\n",
		first.ID,
		first.Name,
		first.RepMember.Sequence.Length,
	)

	// Output:
	// Entry ID: UniRef50_UPI002E2621C6
	// Name: Cluster: uncharacterized protein LOC134193701
	// Sequence Length: 49499
}
153 changes: 153 additions & 0 deletions lib/bio/uniref/uniref.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
/*
Package uniref provides a parser for UniRef XML files.

UniRef clusters uniprot proteins by similarity. This is useful for doing
bioinformatics on protein space, as many proteins are sequenced a ton of times
in different organisms, and you don't want those proteins to dominate your
training data.

UniRef data dumps are available as FASTA or XML formatted data. The XML has
more rich data, so we use that. The parser was created using AI.

UniProt Reference Clusters (UniRef) provide clustered sets of sequences from
the UniProt Knowledgebase (including isoforms) and selected UniParc records in
order to obtain complete coverage of the sequence space at several resolutions
while hiding redundant sequences (but not their descriptions) from view.
(taken from uniref reference https://www.uniprot.org/help/uniref)

Download uniref data dumps here: https://www.uniprot.org/downloads

UniRef is published at three clustering levels:
- UniRef100: Clusters of sequences that have 100% sequence identity and same length
- UniRef90: Clusters of sequences with at least 90% sequence identity and 80% overlap
- UniRef50: Clusters of sequences with at least 50% sequence identity and 80% overlap
*/
package uniref

import (
"bytes"
"encoding/xml"
"fmt"
"io"
"strings"
)

// Header is an empty struct since UniRef files don't have headers. It exists
// so the parser can offer a Header method that returns a value of this type.
type Header struct{}

// Entry represents a UniRef entry, i.e. one <entry> element of the file.
type Entry struct {
// XMLName ties this struct to the element name "entry".
XMLName xml.Name `xml:"entry"`
// ID is the value of the entry's "id" attribute.
ID string `xml:"id,attr"`
// Updated is the value of the entry's "updated" attribute, kept as the raw
// string from the file.
Updated string `xml:"updated,attr"`
// Name is the text of the <name> child element.
Name string `xml:"name"`
// Properties holds the <property> elements that are direct children of
// the entry.
Properties []Property `xml:"property"`
// RepMember is the entry's <representativeMember> element.
RepMember RepresentativeMember `xml:"representativeMember"`
// Members holds the entry's <member> child elements, if any.
Members []Member `xml:"member"`
}

// Property represents a property element: a type/value pair carried entirely
// in the element's attributes.
type Property struct {
// Type is the value of the "type" attribute.
Type string `xml:"type,attr"`
// Value is the value of the "value" attribute, kept as a string.
Value string `xml:"value,attr"`
}

// DBReference represents a database reference, i.e. a <dbReference> element.
type DBReference struct {
// Type is the value of the "type" attribute.
Type string `xml:"type,attr"`
// ID is the value of the "id" attribute.
ID string `xml:"id,attr"`
// Properties holds the <property> elements nested inside the reference.
Properties []Property `xml:"property"`
}

// Sequence represents a sequence element.
type Sequence struct {
// Length is the "length" attribute parsed as an integer. It is read from
// the attribute, not computed from Value.
Length int `xml:"length,attr"`
// Checksum is the value of the "checksum" attribute.
Checksum string `xml:"checksum,attr"`
// Value is the character data of the element, i.e. the sequence text.
Value string `xml:",chardata"`
}

// Member represents a member element.
type Member struct {
// DBRef is the member's <dbReference> element.
DBRef DBReference `xml:"dbReference"`
// Sequence is the member's <sequence> element. It is a pointer, so it is
// presumably nil when the member carries no <sequence> — check before
// dereferencing.
Sequence *Sequence `xml:"sequence"`
}

// RepresentativeMember represents the representative member of an entry. It is
// a distinct named type with the same fields as Member.
type RepresentativeMember Member

// UniRef represents the root element which can be UniRef50, UniRef90, or UniRef100
type UniRef struct {
// XMLName carries no tag, so it records whatever name the root element has.
XMLName xml.Name // This will automatically match the root element name
// ReleaseDate is the value of the root's "releaseDate" attribute.
ReleaseDate string `xml:"releaseDate,attr"`
// Version is the value of the root's "version" attribute.
Version string `xml:"version,attr"`
// Entries holds the <entry> children of the root element.
Entries []Entry `xml:"entry"`
}

// GetUniRefVersion reports the clustering level encoded in the root element
// name: whatever follows the "UniRef" prefix (for example "50", "90" or
// "100"). It returns "" when the root name does not start with "UniRef".
func (u *UniRef) GetUniRefVersion() string {
	const rootPrefix = "UniRef"

	local := u.XMLName.Local
	if !strings.HasPrefix(local, rootPrefix) {
		return ""
	}
	return local[len(rootPrefix):]
}

// Parser reads UniRef entries from an XML stream. Create one with NewParser.
type Parser struct {
// decoder is the XML decoder reading the underlying input.
decoder *xml.Decoder
// uniref holds the root-element state; it is nil until the first call to
// Next.
uniref *UniRef
// current counts the entries already returned by Next. NewParser sets it
// to -1 and the first call to Next resets it to 0.
current int
}

// NewParser returns a Parser that reads UniRef XML from r.
//
// UniRef dumps declare encoding="ISO-8859-1". encoding/xml only understands
// UTF-8 and rejects any other byte sequence, so the decoder is given a charset
// reader that transcodes ISO-8859-1 to UTF-8 on the fly. Any other non-UTF-8
// charset makes decoding fail with an "unsupported charset" error.
//
// The returned error is currently always nil; it is part of the signature so
// that construction can fail in the future without an API break.
func NewParser(r io.Reader) (*Parser, error) {
	decoder := xml.NewDecoder(r)
	decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) {
		if strings.EqualFold(charset, "iso-8859-1") {
			return &latin1Reader{src: input}, nil
		}
		return nil, fmt.Errorf("unsupported charset: %s", charset)
	}

	return &Parser{
		decoder: decoder,
		current: -1,
	}, nil
}

// latin1Reader converts an ISO-8859-1 byte stream into UTF-8. Every
// ISO-8859-1 byte is the Unicode code point of the same value, so bytes below
// 0x80 pass through unchanged and bytes from 0x80 up become a two-byte UTF-8
// sequence.
type latin1Reader struct {
	src        io.Reader
	scratch    []byte // reusable buffer for raw ISO-8859-1 input
	pending    byte   // continuation byte that did not fit in the last Read
	hasPending bool
	err        error // source error held back until pending has been delivered
}

// Read fills p with UTF-8 transcoded from the underlying ISO-8859-1 source.
func (l *latin1Reader) Read(p []byte) (int, error) {
	if len(p) == 0 {
		return 0, nil
	}
	if l.hasPending {
		p[0] = l.pending
		l.hasPending = false
		return 1, nil
	}
	if l.err != nil {
		return 0, l.err
	}

	// Each input byte expands to at most two output bytes, so read at most
	// half of p (rounded up). At worst a single continuation byte overflows,
	// and that one is parked in l.pending.
	want := (len(p) + 1) / 2
	if cap(l.scratch) < want {
		l.scratch = make([]byte, want)
	}
	n, err := l.src.Read(l.scratch[:want])

	out := 0
	for _, b := range l.scratch[:n] {
		if b < 0x80 {
			p[out] = b
			out++
			continue
		}
		p[out] = 0xC0 | b>>6
		out++
		tail := 0x80 | b&0x3F
		if out < len(p) {
			p[out] = tail
			out++
		} else {
			l.pending, l.hasPending = tail, true
		}
	}

	// Callers may stop reading as soon as they see an error, so never report
	// one while a byte is still waiting to be delivered.
	if err != nil && l.hasPending {
		l.err = err
		err = nil
	}
	return out, err
}

// Header exists so the parser offers the same method set as the other
// parsers. UniRef files carry no header, so it always yields the zero Header
// and a nil error.
func (p *Parser) Header() (Header, error) {
	var empty Header
	return empty, nil
}

// Next returns the next Entry from the UniRef file.
//
// Entries are decoded one at a time straight from the XML stream, so memory
// use is bounded by the size of a single entry rather than the whole file.
// The first call also consumes the root element (UniRef50, UniRef90 or
// UniRef100) and records its name, releaseDate and version.
//
// Next returns io.EOF once the closing tag of the root element has been read,
// or when the input is empty. Any other error comes from the XML decoder; for
// example, input that ends before the root element is closed yields a syntax
// error rather than io.EOF.
func (p *Parser) Next() (Entry, error) {
	for {
		tok, err := p.decoder.Token()
		if err != nil {
			return Entry{}, err
		}

		switch el := tok.(type) {
		case xml.StartElement:
			if p.uniref == nil {
				// The first start element is the root: remember its metadata
				// and keep scanning for the first entry.
				root := &UniRef{XMLName: el.Name}
				for _, attr := range el.Attr {
					switch attr.Name.Local {
					case "releaseDate":
						root.ReleaseDate = attr.Value
					case "version":
						root.Version = attr.Value
					}
				}
				p.uniref = root
				p.current = 0
				continue
			}
			if el.Name.Local != "entry" {
				// Unknown child of the root: skip it, including its subtree.
				if err := p.decoder.Skip(); err != nil {
					return Entry{}, err
				}
				continue
			}
			var entry Entry
			if err := p.decoder.DecodeElement(&entry, &el); err != nil {
				return Entry{}, err
			}
			p.current++
			return entry, nil
		case xml.EndElement:
			// Nested end tags are consumed by DecodeElement and Skip, so the
			// only end tag seen here closes the root element.
			return Entry{}, io.EOF
		}
	}
}

// ToXML serializes the entry as indented XML and returns the resulting text.
// On an encoding failure it returns the empty string together with the error.
func (e *Entry) ToXML() (string, error) {
	var out bytes.Buffer

	encoder := xml.NewEncoder(&out)
	encoder.Indent("", " ")

	err := encoder.Encode(e)
	if err != nil {
		return "", err
	}
	return out.String(), nil
}
Loading
Loading