diff --git a/docs/library-hashing.md b/docs/library-hashing.md index cb7e54c3..1d5013ff 100644 --- a/docs/library-hashing.md +++ b/docs/library-hashing.md @@ -12,7 +12,7 @@ Hashes make incredibly powerful unique identifiers and with a wide array of hash The golang team is currently figuring out the best way to implement blake3 into the standard library but in the meantime `poly` provides this special function and method wrapper to hash sequences using blake3. This will eventually be deprecated in favor of only using the `GenericSequenceHash()` function and `.Hash()` method wrapper. ```go - // getting our example AnnotatedSequence struct + // getting our example Sequence struct puc19AnnotatedSequence := ReadJSON("data/puc19static.json") // there are two ways to use the blake3 Least Rotation hasher. @@ -21,7 +21,7 @@ The golang team is currently figuring out the best way to implement blake3 into puc19Blake3Hash := puc19AnnotatedSequence.Blake3Hash() fmt.Println(puc19Blake3Hash) - // the second is with the Blake3SequenceHash(annotatedSequence AnnotatedSequence) function. + // the second is with the Blake3SequenceHash(sequence Sequence) function. puc19Blake3Hash = puc19AnnotatedSequence.Blake3Hash() fmt.Println(puc19Blake3Hash) ``` @@ -33,7 +33,7 @@ Again, this will be deprecated in favor of using generic hashing with blake3 in `poly` also provides a generic hashing function and method wrapper for hashing sequences with arbitrary hashing functions that use the golang standard library's hash function interface. Check out this switch statement in the [hash command source code](https://github.com/TimothyStiles/poly/blob/f51ec1c08820394d7cab89a5a4af92d9b803f0a4/commands.go#L261) to see all that `poly` provides in the command line utility alone. ```go - // getting our example AnnotatedSequence struct + // getting our example Sequence struct puc19AnnotatedSequence := ReadJSON("data/puc19static.json") // there are two ways to use the Least Rotation generic hasher. @@ -42,7 +42,7 @@ Again, this will be deprecated in favor of using generic hashing with blake3 in puc19Sha1Hash := puc19AnnotatedSequence.Hash(crypto.SHA1) fmt.Println(puc19Sha1Hash) - // the second is with the GenericSequenceHash() function where you pass an AnnotatedSequence along with a hash function as arguments. + // the second is with the GenericSequenceHash() function where you pass an Sequence along with a hash function as arguments. puc19Sha1Hash = GenericSequenceHash(puc19AnnotatedSequence, crypto.SHA1) fmt.Println(puc19Sha1Hash) ``` diff --git a/docs/library-io.md b/docs/library-io.md index 5703ddbf..eefcd28b 100644 --- a/docs/library-io.md +++ b/docs/library-io.md @@ -3,13 +3,13 @@ id: library-io title: Sequence Input Output --- -At the center of `poly`'s annotated sequence support is the `AnnotatedSequence` struct. Structs are kind of Go's answer to objects in other languages. They provide a way of making custom datatypes and methods for developers to use. More on that [here](https://tour.golang.org/moretypes/2), [here](https://gobyexample.com/methods), and [here](https://www.golang-book.com/books/intro/9). +At the center of `poly`'s annotated sequence support is the `Sequence` struct. Structs are kind of Go's answer to objects in other languages. They provide a way of making custom datatypes and methods for developers to use. More on that [here](https://tour.golang.org/moretypes/2), [here](https://gobyexample.com/methods), and [here](https://www.golang-book.com/books/intro/9). -Anywho. 
`poly` centers around reading in various annotated sequence formats like genbank, or gff and parsing them into an `AnnotatedSequence` to do stuff with them. Whether that's being written out to JSON or being used by `poly` itself. Here are some examples.
+Anywho. `poly` centers around reading in various annotated sequence formats like genbank, or gff and parsing them into a `Sequence` to do stuff with them. Whether that's being written out to JSON or being used by `poly` itself. Here are some examples.
 
 ## Readers
 
-For all supported file formats `poly` supports a reader. A reader is a function literally named `ReadJSON(path)`, `ReadGbk(path)`, or `ReadGff(path)` that takes one argument - a filepath where your file is located, and returns an `AnnotatedSequence` struct.
+For all supported file formats `poly` supports a reader. A reader is a function literally named `ReadJSON(path)`, `ReadGbk(path)`, or `ReadGff(path)` that takes one argument - a filepath where your file is located, and returns a `Sequence` struct.
 
 ```go
   bsubAnnotatedSequence := ReadGbk("data/bsub.gbk")
@@ -17,23 +17,23 @@ For all supported file formats `poly` supports a reader. A reader is a function
   puc19AnnotatedSequence := ReadJSON("data/puc19static.json")
 ```
 
-These AnnotatedSequence structs contain all sorts of goodies but can be broken down into three sub main structs. `AnnotatedSequence.Meta`, `AnnotatedSequence.Features`, and `AnnotatedSequence.Sequence`.
+These Sequence structs contain all sorts of goodies but can be broken down into three main sub structs. `Sequence.Meta`, `Sequence.Features`, and `Sequence.Sequence`.
 
 > Before we move on with the rest of IO I think it'd be good to go over these sub structs in the next section but of course you can skip to [writers](#writers) if you'd like.
 
-## AnnotatedSequence structs
+## Sequence structs
 
-Like I just said these AnnotatedSequence structs contain all sorts of goodies but can be broken down into three main sub structs:
+Like I just said these Sequence structs contain all sorts of goodies but can be broken down into three main sub structs:
 
- * [AnnotatedSequence.Meta](#annotatedsequencemeta)
- * [AnnotatedSequence.Features](#annotatedsequencefeatures)
- * [AnnotatedSequence.Sequence](#annotatedsequencesequence)
+ * [Sequence.Meta](#sequencemeta)
+ * [Sequence.Features](#sequencefeatures)
+ * [Sequence.Sequence](#sequencesequence)
 
-Here's how the AnnotatedSequence struct is actually implemented as of [commit c4fc7e](https://github.com/TimothyStiles/poly/blob/c4fc7e6f6cdbd9e5ed2d8ffdbeb206d1d5a8d720/io.go#L108).
+Here's how the Sequence struct is actually implemented as of [commit c4fc7e](https://github.com/TimothyStiles/poly/blob/c4fc7e6f6cdbd9e5ed2d8ffdbeb206d1d5a8d720/io.go#L108).
 
 ```go
-  // AnnotatedSequence holds all sequence information in a single struct.
-  type AnnotatedSequence struct {
+  // Sequence holds all sequence information in a single struct.
+  type Sequence struct {
   	Meta     Meta
   	Features []Feature
   	Sequence Sequence
@@ -42,11 +42,11 @@ Here's how the AnnotatedSequence struct is actually implemented as of [commit c4
 
 > You can check out the original implementation [here](https://github.com/TimothyStiles/poly/blob/c4fc7e6f6cdbd9e5ed2d8ffdbeb206d1d5a8d720/io.go#L108) but I warn you that this is a snapshot and likely has been updated since last writing.
 
-### AnnotatedSequence.Meta
+### Sequence.Meta
 
 The Meta substruct contains various meta information about whatever record was parsed.
Things like name, version, genbank references, etc. -So if I wanted to get something like the Genbank Accession number for a AnnotatedSequence I'd get it like this: +So if I wanted to get something like the Genbank Accession number for a Sequence I'd get it like this: ```go bsubAnnotatedSequence := ReadGbk("data/bsub.gbk") @@ -68,7 +68,7 @@ Same goes for a lot of other stuff: Here's how the Meta struct is actually implemented in [commit c4fc7e](https://github.com/TimothyStiles/poly/blob/c4fc7e6f6cdbd9e5ed2d8ffdbeb206d1d5a8d720/io.go#L34) which is the latest as of writing. ```go - // Meta Holds all the meta information of an AnnotatedSequence struct. + // Meta Holds all the meta information of an Sequence struct. type Meta struct { Name string GffVersion string @@ -93,9 +93,9 @@ Here's how the Meta struct is actually implemented in [commit c4fc7e](https://gi You'll notice that there are actually three more substructs towards the bottom. They hold extra genbank specific information that's handy to have grouped together. More about how genbank files are structered can be found [here](https://www.ncbi.nlm.nih.gov/Sitemap/samplerecord.html). -### AnnotatedSequence.Features +### Sequence.Features -The `Features` substruct is actually a slice (golang term for what is essentially a dynamic length list) of `Feature` structs that can be iterated through. For example if you wanted to iterate through an `AnnotatedSequence`'s features and get their name (i.e GFP) and type (i.e CDS) you'd do it like this. +The `Features` substruct is actually a slice (golang term for what is essentially a dynamic length list) of `Feature` structs that can be iterated through. For example if you wanted to iterate through an `Sequence`'s features and get their name (i.e GFP) and type (i.e CDS) you'd do it like this. ```go bsubAnnotatedSequence := ReadGbk("data/bsub.gbk") @@ -106,12 +106,12 @@ The `Features` substruct is actually a slice (golang term for what is essentiall The `Feature` struct has about 10 or so fields which you can learn more about from this section in [commit c4fc7e](https://github.com/TimothyStiles/poly/blob/c4fc7e6f6cdbd9e5ed2d8ffdbeb206d1d5a8d720/io.go#L80). -### AnnotatedSequence.Sequence +### Sequence.Sequence -The AnnotatedSequence Sequence substruct is by far the most basic and critical. Without it well, you ain't go no DNA. The substruct itself has 4 simple fields. +The Sequence Sequence substruct is by far the most basic and critical. Without it well, you ain't go no DNA. The substruct itself has 4 simple fields. ```go - // Sequence holds raw sequence information in an AnnotatedSequence struct. + // Sequence holds raw sequence information in an Sequence struct. type Sequence struct { Description string Hash string @@ -122,7 +122,7 @@ The AnnotatedSequence Sequence substruct is by far the most basic and critical. The `Description`, `Hash`, and `HashFunction` are at all identifying fields of the Sequence string. The `Description` is the same kind of short description you'd find in a `fasta` or `fastq` file. The `Hash` and `HashFunction` are used to create a unique identifier specify to the sequence string which you'll learn more about in the next chapter on sequence hashing. 
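Each `Feature` also carries a pointer back to its parent record, so you can pull the underlying sub-sequence for any feature without slicing the full sequence yourself. Here's a rough sketch in the same style as the examples above, using the `GetSequence()` method wrapper touched elsewhere in this changeset and the same `bsub.gbk` test file:

```go
  bsubAnnotatedSequence := ReadGbk("data/bsub.gbk")

  // print the name and raw sub-sequence of every coding sequence feature.
  for _, feature := range bsubAnnotatedSequence.Features {
    if feature.Type == "CDS" {
      fmt.Println(feature.Name, feature.GetSequence())
    }
  }
```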
-To get an AnnotatedSequence sequence you can address it like so:
+To get a Sequence's sequence you can address it like so:
 
 ```go
   bsubAnnotatedSequence := ReadGbk("data/bsub.gbk")
@@ -133,10 +133,10 @@ To get an AnnotatedSequence sequence you can address it like so:
 
 `poly` tries to supply a writer for all supported file formats that have a reader.
 
- Writers take two arguments. The first is an AnnotatedSequence struct, the second is a path to write out to.
+ Writers take two arguments. The first is a Sequence struct, the second is a path to write out to.
 
 ```go
-  // getting AnnotatedSequence(s) to write out again.
+  // getting Sequence(s) to write out again.
   bsubAnnotatedSequence := ReadGbk("data/bsub.gbk")
 
   // writing out gbk file input as json.
@@ -154,7 +154,7 @@ To get an AnnotatedSequence sequence you can address it like so:
 
 ## Parsers
 
-`poly` parsers are what actually parse input files from a string without any of the system IO. This is particularly useful if you're like me and have an old database holding genbank files as strings. You can take those strings from a database or whatever and just pass them to `ParseGbk()`, or `ParseGff()` and they'll convert them into AnnotatedSequence structs.
+`poly` parsers are what actually parse input files from a string without any of the system IO. This is particularly useful if you're like me and have an old database holding genbank files as strings. You can take those strings from a database or whatever and just pass them to `ParseGbk()`, or `ParseGff()` and they'll convert them into Sequence structs.
 
 ```go
   puc19AnnotatedSequence := ParseGbk("imagine this is actually a gbk in string format.")
@@ -164,10 +164,10 @@ That's it. The reason we don't have a `ParseJSON()` is that golang, like almost
 
 ## Builders
 
-`poly` builders take AnnotatedSequence structs and use them to build strings for different file formats.
+`poly` builders take Sequence structs and use them to build strings for different file formats.
 
 ```go
-  // generating an AnnotatedSequence struct from a gff file.
+  // generating a Sequence struct from a gff file.
   ecoliAnnotatedSequence := ReadGff("data/ecoli-mg1655.gff")
 
   // generating a gff string that then can be piped to stdout or written to a database.
diff --git a/hash.go b/hash.go
index 0da19b01..475f9187 100644
--- a/hash.go
+++ b/hash.go
@@ -95,12 +95,12 @@ func RotateSequence(sequence string) string {
 	return sequence
 }
 
-// Hash is a method wrapper for hashing AnnotatedSequence structs.
-func (annotatedSequence AnnotatedSequence) Hash(hash hash.Hash) string {
-	if annotatedSequence.Meta.Locus.Circular {
-		annotatedSequence.Sequence.Sequence = RotateSequence(annotatedSequence.Sequence.Sequence)
+// Hash is a method wrapper for hashing Sequence structs.
+func (sequence Sequence) Hash(hash hash.Hash) string { + if sequence.Meta.Locus.Circular { + sequence.Sequence = RotateSequence(sequence.Sequence) } - seqHash, _ := hashSequence(annotatedSequence.Sequence.Sequence, hash) + seqHash, _ := hashSequence(sequence.Sequence, hash) return seqHash } diff --git a/hash_test.go b/hash_test.go index 799ff529..44dff797 100644 --- a/hash_test.go +++ b/hash_test.go @@ -29,10 +29,10 @@ func TestHashRegression(t *testing.T) { } func TestLeastRotation(t *testing.T) { - annotatedSequence := ReadGbk("data/puc19.gbk") + sequence := ReadGbk("data/puc19.gbk") var sequenceBuffer bytes.Buffer - sequenceBuffer.WriteString(annotatedSequence.Sequence.Sequence) + sequenceBuffer.WriteString(sequence.Sequence) bufferLength := sequenceBuffer.Len() var rotatedSequence string diff --git a/io.go b/io.go index dcfdb0c3..39a9c8ff 100644 --- a/io.go +++ b/io.go @@ -13,21 +13,12 @@ import ( "github.com/mitchellh/go-wordwrap" ) -// TAB just represents "\t" used in GBK IO. -const TAB = "\t" - -// FIVESPACE just represents 5 space characters. Used in GBK IO -const FIVESPACE = " " - -// TENSPACE just represents 10 space characters. Used in GBK IO -const TENSPACE = FIVESPACE + FIVESPACE - /****************************************************************************** File is structured as so: Structs: - AnnotatedSequence - main struct for sequence handling plus sub structs. + Sequence - main struct for sequence handling plus sub structs. File specific parsers, builders readers, and writers: Gff - parser, builder, reader, writer @@ -40,11 +31,11 @@ File specific parsers, builders readers, and writers: /****************************************************************************** -AnnotatedSequence related structs begin here. +Sequence related structs begin here. ******************************************************************************/ -// Meta Holds all the meta information of an AnnotatedSequence struct. +// Meta Holds all the meta information of an Sequence struct. type Meta struct { Name string `json:"name"` GffVersion string `json:"gff_version"` @@ -103,45 +94,41 @@ type Location struct { type Feature struct { Name string //Seqid in gff, name in gbk //gff specific - Source string `json:"source"` - Type string `json:"type"` - Score string `json:"score"` - Strand string `json:"strand"` - Phase string `json:"phase"` - Attributes map[string]string `json:"attributes"` - GbkLocationString string `json:"gbk_location_string"` - Sequence string `json:"sequence"` - SequenceLocation Location `json:"sequence_location"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` - ParentAnnotatedSequence *AnnotatedSequence `json:"-"` + Source string `json:"source"` + Type string `json:"type"` + Score string `json:"score"` + Strand string `json:"strand"` + Phase string `json:"phase"` + Attributes map[string]string `json:"attributes"` + GbkLocationString string `json:"gbk_location_string"` + Sequence string `json:"sequence"` + SequenceLocation Location `json:"sequence_location"` + SequenceHash string `json:"sequence_hash"` + Description string `json:"description"` + SequenceHashFunction string `json:"hash_function"` + ParentSequence *Sequence `json:"-"` } -// Sequence holds raw sequence information in an AnnotatedSequence struct. +// Sequence holds all sequence information in a single struct. 
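Since the renamed `Hash` method accepts anything satisfying the standard library's `hash.Hash` interface, swapping digest functions is a one-line change. A minimal sketch, written as if it sat next to `hash_test.go` inside the package and reusing the circular pUC19 test file already in the repo:

```go
package poly

import (
	"crypto/sha1"
	"crypto/sha256"
	"fmt"
)

func ExampleSequence_Hash() {
	sequence := ReadGbk("data/puc19.gbk")

	// pUC19 is circular, so Hash first rotates the sequence to its least
	// rotation; the same plasmid hashes identically regardless of start site.
	fmt.Println(sequence.Hash(sha1.New()))
	fmt.Println(sequence.Hash(sha256.New()))
}
```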
type Sequence struct { - Description string `json:"description"` - SequenceHash string `json:"sequence_hash"` - SequenceHashFunction string `json:"hash_function"` - Sequence string `json:"sequence"` - // ParentAnnotatedSequence *AnnotatedSequence + Meta Meta `json:"meta"` + Description string `json:"description"` + SequenceHash string `json:"sequence_hash"` + SequenceHashFunction string `json:"hash_function"` + Sequence string `json:"sequence"` + Features []Feature `json:"features"` } -// AnnotatedSequence holds all sequence information in a single struct. -type AnnotatedSequence struct { - Meta Meta `json:"meta"` - Features []Feature `json:"features"` - Sequence Sequence `json:"sequence"` -} - -func (annotatedSequence *AnnotatedSequence) addFeature(feature Feature) []Feature { - feature.ParentAnnotatedSequence = annotatedSequence - annotatedSequence.Features = append(annotatedSequence.Features, feature) - return annotatedSequence.Features +// AddFeature is the canonical way to add a Feature into a Sequence struct. Appending a Feature struct directly to Sequence.Feature's will break .GetSequence() method. +func (sequence *Sequence) AddFeature(feature Feature) []Feature { + feature.ParentSequence = sequence + sequence.Features = append(sequence.Features, feature) + return sequence.Features } /****************************************************************************** -AnnotatedSequence related structs end here. +Sequence related structs end here. ******************************************************************************/ @@ -151,9 +138,9 @@ GFF specific IO related things begin here. ******************************************************************************/ -// ParseGff Takes in a string representing a gffv3 file and parses it into an AnnotatedSequence object. -func ParseGff(gff string) AnnotatedSequence { - annotatedSequence := AnnotatedSequence{} +// ParseGff Takes in a string representing a gffv3 file and parses it into an Sequence object. +func ParseGff(gff string) Sequence { + sequence := Sequence{} lines := strings.Split(gff, "\n") metaString := lines[0:2] @@ -167,8 +154,6 @@ func ParseGff(gff string) AnnotatedSequence { meta.RegionEnd, _ = strconv.Atoi(regionStringArray[3]) meta.Size = meta.RegionEnd - meta.RegionStart - // records := []Feature{} - sequence := Sequence{} var sequenceBuffer bytes.Buffer fastaFlag := false for _, line := range lines { @@ -190,7 +175,7 @@ func ParseGff(gff string) AnnotatedSequence { record.Source = fields[1] record.Type = fields[2] - // Indexing starts at 1 for gff so we need to shift down for AnnotatedSequence 0 index. + // Indexing starts at 1 for gff so we need to shift down for Sequence 0 index. record.SequenceLocation.Start, _ = strconv.Atoi(fields[3]) record.SequenceLocation.Start-- record.SequenceLocation.End, _ = strconv.Atoi(fields[4]) @@ -209,23 +194,22 @@ func ParseGff(gff string) AnnotatedSequence { value := attributeSplit[1] record.Attributes[key] = value } - annotatedSequence.addFeature(record) + sequence.AddFeature(record) } } sequence.Sequence = sequenceBuffer.String() - annotatedSequence.Meta = meta - annotatedSequence.Sequence = sequence + sequence.Meta = meta - return annotatedSequence + return sequence } // BuildGff takes an Annotated sequence and returns a byte array representing a gff to be written out. 
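The comment above `AddFeature` is worth taking seriously: appending to `Sequence.Features` directly skips setting the `ParentSequence` pointer, which is what `Feature.GetSequence()` relies on. A small sketch of the intended usage, with made-up feature values:

```go
package poly

import "fmt"

func ExampleSequence_AddFeature() {
	var sequence Sequence
	sequence.Sequence = "ATGGCTAGCAAAGGAGAAGAA" // made-up stretch of DNA

	feature := Feature{
		Name:             "hypothetical CDS",
		Type:             "CDS",
		SequenceLocation: Location{Start: 0, End: 21},
	}

	// AddFeature wires up feature.ParentSequence for us;
	// appending to sequence.Features directly would not.
	sequence.AddFeature(feature)

	fmt.Println(sequence.Features[0].GetSequence())
}
```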
-func BuildGff(annotatedSequence AnnotatedSequence) []byte { +func BuildGff(sequence Sequence) []byte { var gffBuffer bytes.Buffer var versionString string - if annotatedSequence.Meta.GffVersion != "" { - versionString = "##gff-version " + annotatedSequence.Meta.GffVersion + "\n" + if sequence.Meta.GffVersion != "" { + versionString = "##gff-version " + sequence.Meta.GffVersion + "\n" } else { versionString = "##gff-version 3 \n" } @@ -236,30 +220,30 @@ func BuildGff(annotatedSequence AnnotatedSequence) []byte { var start string var end string - if annotatedSequence.Meta.Name != "" { - name = annotatedSequence.Meta.Name - } else if annotatedSequence.Meta.Locus.Name != "" { - name = annotatedSequence.Meta.Locus.Name - } else if annotatedSequence.Meta.Accession != "" { - name = annotatedSequence.Meta.Accession + if sequence.Meta.Name != "" { + name = sequence.Meta.Name + } else if sequence.Meta.Locus.Name != "" { + name = sequence.Meta.Locus.Name + } else if sequence.Meta.Accession != "" { + name = sequence.Meta.Accession } else { name = "unknown" } - if annotatedSequence.Meta.RegionStart != 0 { - start = strconv.Itoa(annotatedSequence.Meta.RegionStart) + if sequence.Meta.RegionStart != 0 { + start = strconv.Itoa(sequence.Meta.RegionStart) } else { start = "1" } - if annotatedSequence.Meta.RegionEnd != 0 { - end = strconv.Itoa(annotatedSequence.Meta.RegionEnd) - } else if annotatedSequence.Meta.Locus.SequenceLength != "" { + if sequence.Meta.RegionEnd != 0 { + end = strconv.Itoa(sequence.Meta.RegionEnd) + } else if sequence.Meta.Locus.SequenceLength != "" { reg, err := regexp.Compile("[^0-9]+") if err != nil { log.Fatal(err) } - end = reg.ReplaceAllString(annotatedSequence.Meta.Locus.SequenceLength, "") + end = reg.ReplaceAllString(sequence.Meta.Locus.SequenceLength, "") } else { end = "1" } @@ -267,14 +251,14 @@ func BuildGff(annotatedSequence AnnotatedSequence) []byte { regionString = "##sequence-region " + name + " " + start + " " + end + "\n" gffBuffer.WriteString(regionString) - for _, feature := range annotatedSequence.Features { + for _, feature := range sequence.Features { var featureString string var featureName string if feature.Name != "" { featureName = feature.Name } else { - featureName = annotatedSequence.Meta.Name + featureName = sequence.Meta.Name } var featureSource string @@ -291,7 +275,7 @@ func BuildGff(annotatedSequence AnnotatedSequence) []byte { featureType = "unknown" } - // Indexing starts at 1 for gff so we need to shift up from AnnotatedSequence 0 index. + // Indexing starts at 1 for gff so we need to shift up from Sequence 0 index. featureStart := strconv.Itoa(feature.SequenceLocation.Start + 1) featureEnd := strconv.Itoa(feature.SequenceLocation.End) @@ -321,9 +305,9 @@ func BuildGff(annotatedSequence AnnotatedSequence) []byte { gffBuffer.WriteString("###\n") gffBuffer.WriteString("##FASTA\n") - gffBuffer.WriteString(">" + annotatedSequence.Meta.Name + "\n") + gffBuffer.WriteString(">" + sequence.Meta.Name + "\n") - for letterIndex, letter := range annotatedSequence.Sequence.Sequence { + for letterIndex, letter := range sequence.Sequence { letterIndex++ if letterIndex%70 == 0 && letterIndex != 0 { gffBuffer.WriteRune(letter) @@ -337,20 +321,20 @@ func BuildGff(annotatedSequence AnnotatedSequence) []byte { } // ReadGff takes in a filepath for a .gffv3 file and parses it into an Annotated Sequence struct. 
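Because the GFF parser and builder are symmetric, a record that came from somewhere other than disk (a database column, an HTTP response) can be round-tripped without touching the filesystem. A hedged sketch, assuming `gffString` holds a valid GFFv3 record:

```go
package poly

import "fmt"

func ExampleParseGff() {
	gffString := "..." // imagine a full GFFv3 record pulled from a database

	sequence := ParseGff(gffString) // string -> Sequence
	rebuilt := BuildGff(sequence)   // Sequence -> []byte, ready to pipe anywhere

	fmt.Println(sequence.Meta.RegionStart, sequence.Meta.RegionEnd)
	fmt.Println(string(rebuilt))
}
```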
-func ReadGff(path string) AnnotatedSequence { +func ReadGff(path string) Sequence { file, err := ioutil.ReadFile(path) - var annotatedSequence AnnotatedSequence + var sequence Sequence if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) } else { - annotatedSequence = ParseGff(string(file)) + sequence = ParseGff(string(file)) } - return annotatedSequence + return sequence } -// WriteGff takes an AnnotatedSequence struct and a path string and writes out a gff to that path. -func WriteGff(annotatedSequence AnnotatedSequence, path string) { - gff := BuildGff(annotatedSequence) +// WriteGff takes an Sequence struct and a path string and writes out a gff to that path. +func WriteGff(sequence Sequence, path string) { + gff := BuildGff(sequence) _ = ioutil.WriteFile(path, gff, 0644) } @@ -366,32 +350,32 @@ JSON specific IO related things begin here. ******************************************************************************/ -// ParseJSON parses an AnnotatedSequence JSON file and adds appropriate pointers to struct. -func ParseJSON(file []byte) AnnotatedSequence { - var annotatedSequence AnnotatedSequence - json.Unmarshal([]byte(file), &annotatedSequence) - legacyFeatures := annotatedSequence.Features - annotatedSequence.Features = []Feature{} +// ParseJSON parses an Sequence JSON file and adds appropriate pointers to struct. +func ParseJSON(file []byte) Sequence { + var sequence Sequence + json.Unmarshal([]byte(file), &sequence) + legacyFeatures := sequence.Features + sequence.Features = []Feature{} for _, feature := range legacyFeatures { - annotatedSequence.addFeature(feature) + sequence.AddFeature(feature) } - return annotatedSequence + return sequence } -// ReadJSON reads an AnnotatedSequence JSON file. -func ReadJSON(path string) AnnotatedSequence { +// ReadJSON reads an Sequence JSON file. +func ReadJSON(path string) Sequence { file, err := ioutil.ReadFile(path) if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) } - annotatedSequence := ParseJSON(file) - return annotatedSequence + sequence := ParseJSON(file) + return sequence } -// WriteJSON writes an AnnotatedSequence struct out to json. -func WriteJSON(annotatedSequence AnnotatedSequence, path string) { - file, _ := json.MarshalIndent(annotatedSequence, "", " ") +// WriteJSON writes an Sequence struct out to json. +func WriteJSON(sequence Sequence, path string) { + file, _ := json.MarshalIndent(sequence, "", " ") _ = ioutil.WriteFile(path, file, 0644) } @@ -407,55 +391,91 @@ FASTA specific IO related things begin here. ******************************************************************************/ -// ParseFASTA parses an array of AnnotatedSequence structs from a FASTA file and adds appropriate pointers to the structs. -func ParseFASTA(fasta string) []AnnotatedSequence { +// ParseFASTA parses a Sequence struct from a FASTA file and adds appropriate pointers to the structs. 
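The JSON reader and writer are the easiest way to snapshot a parsed record, and `ParseJSON` re-attaches the `ParentSequence` pointers on the way back in. A minimal sketch against the test data already in the repo, writing to a throwaway path:

```go
package poly

import (
	"fmt"
	"os"
)

func ExampleWriteJSON() {
	sequence := ReadGbk("data/puc19.gbk") // parse a GenBank record

	WriteJSON(sequence, "data/puc19_roundtrip.json") // snapshot it as JSON
	roundTrip := ReadJSON("data/puc19_roundtrip.json")
	os.Remove("data/puc19_roundtrip.json") // clean up, like the tests do

	fmt.Println(roundTrip.Meta.Locus.Name)
}
```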
+func ParseFASTA(fasta string) Sequence { - annotatedSequenceArray := []AnnotatedSequence{} - currentAnnotatedSequence := AnnotatedSequence{} + var sequence Sequence + var feature Feature + var features []Feature + var sequenceBuffer bytes.Buffer + var start int + var end int lines := strings.Split(fasta, "\n") - meta := Meta{} + linesLength := len(lines) - 1 - sequence := Sequence{} - var sequenceBuffer bytes.Buffer - for _, line := range lines { + for lineIndex, line := range lines { + + // if there's nothing on this line skip this iteration of the loop if len(line) == 0 { - // save the current seq - sequence.Sequence = sequenceBuffer.String() - currentAnnotatedSequence.Meta = meta - currentAnnotatedSequence.Sequence = sequence - annotatedSequenceArray = append(annotatedSequenceArray, currentAnnotatedSequence) - - // reset the seq - sequenceBuffer.Reset() - sequence = Sequence{} - meta = Meta{} - currentAnnotatedSequence = AnnotatedSequence{} - - } else if line[0:1] == ">" { - sequence.Description = line[1:] + continue + } + + // if it's a comment skip this line + if line[0:1] == ";" { + continue + } + + if line[0:1] == ">" && lineIndex == 0 { // if it's the first description + feature.Description = line[1:] + + } else if line[0:1] == ">" || lineIndex == linesLength { // if it's a description or the last line + + // if end of file write line to buffer + if lineIndex == linesLength { + sequenceBuffer.WriteString(line) + } + + // setting sequence location + feature.SequenceLocation.Start = start + end = len(sequenceBuffer.String()) + feature.SequenceLocation.End = end + + // setting start to end after assigning to location in feature. + start = end + + // adding new feature to features slice + features = append(features, feature) + + // resetting feature + feature = Feature{} + + // if it's the last line + if lineIndex != linesLength { + feature.Description = line[1:] + } + } else { sequenceBuffer.WriteString(line) } } - if len(sequenceBuffer.Bytes()) > 0 { - sequence.Sequence = sequenceBuffer.String() - currentAnnotatedSequence.Meta = meta - currentAnnotatedSequence.Sequence = sequence - annotatedSequenceArray = append(annotatedSequenceArray, currentAnnotatedSequence) + sequence.Sequence = sequenceBuffer.String() + + // add features last so that internal pointer to parent sequence is accurate + for _, feature := range features { + sequence.AddFeature(feature) } - return annotatedSequenceArray + return sequence } -// BuildFASTA builds a FASTA string from an array of AnnotatedSequence structs. -func BuildFASTA(annotatedSequenceArray []AnnotatedSequence) []byte { +// BuildFASTA builds a FASTA string from a Sequence struct. +func BuildFASTA(sequence Sequence) []byte { var fastaBuffer bytes.Buffer - const maxLineLength = 80 - for _, annotatedSequence := range annotatedSequenceArray { - fastaBuffer.WriteString(">" + annotatedSequence.Sequence.Description + "\n") - for characterIndex, character := range annotatedSequence.Sequence.Sequence { + const maxLineLength = 70 + + for featureIndex, feature := range sequence.Features { + + // if there isn't a descriptive comment don't write out feature to fasta file. 
+ if feature.Description == "" { + continue + } + // write feature comment + fastaBuffer.WriteString(">" + feature.Description + "\n") + + // range over sequence and add spacing + for characterIndex, character := range feature.GetSequence() { characterIndex++ if characterIndex%maxLineLength == 0 && characterIndex != 0 { fastaBuffer.WriteRune(character) @@ -464,14 +484,18 @@ func BuildFASTA(annotatedSequenceArray []AnnotatedSequence) []byte { fastaBuffer.WriteRune(character) } } - fastaBuffer.WriteString("\n\n") + + // if it's the end write new line. + if featureIndex != len(sequence.Features)-1 { + fastaBuffer.WriteString("\n\n") + } } return fastaBuffer.Bytes() } -// ReadFASTA reads an array of AnnotatedSequence structs from a FASTA file. -func ReadFASTA(path string) []AnnotatedSequence { +// ReadFASTA reads a Sequence struct from a FASTA file. +func ReadFASTA(path string) Sequence { file, err := ioutil.ReadFile(path) if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) @@ -480,9 +504,9 @@ func ReadFASTA(path string) []AnnotatedSequence { return annotatedSequenceArray } -// WriteFASTA writes an array of AnnotatedSequence structs out to FASTA. -func WriteFASTA(annotatedSequenceArray []AnnotatedSequence, path string) { - _ = ioutil.WriteFile(path, BuildFASTA(annotatedSequenceArray), 0644) +// WriteFASTA writes a Sequence struct out to FASTA. +func WriteFASTA(sequence Sequence, path string) { + _ = ioutil.WriteFile(path, BuildFASTA(sequence), 0644) } /****************************************************************************** @@ -497,14 +521,11 @@ GBK specific IO related things begin here. ******************************************************************************/ -// ParseGbk takes in a string representing a gbk/gb/genbank file and parses it into an AnnotatedSequence object. -func ParseGbk(gbk string) AnnotatedSequence { +// ParseGbk takes in a string representing a gbk/gb/genbank file and parses it into an Sequence object. +func ParseGbk(gbk string) Sequence { lines := strings.Split(gbk, "\n") - // create top level Annotated Sequence struct - var annotatedSequence AnnotatedSequence - // Create meta struct meta := Meta{} meta.Other = make(map[string]string) @@ -550,7 +571,7 @@ func ParseGbk(gbk string) AnnotatedSequence { case "FEATURES": features = getFeatures(subLines) case "ORIGIN": - sequence = getSequence(subLines) + sequence.Sequence = getSequence(subLines) sequenceBreakFlag = true default: if quickMetaCheck(line) { @@ -562,23 +583,20 @@ func ParseGbk(gbk string) AnnotatedSequence { } // add meta to annotated sequence - annotatedSequence.Meta = meta + sequence.Meta = meta // add features to annotated sequence with pointer to annotated sequence in each feature for _, feature := range features { - annotatedSequence.addFeature(feature) + sequence.AddFeature(feature) } - // add sequence to annotated sequence - annotatedSequence.Sequence = sequence - - return annotatedSequence + return sequence } // BuildGbk builds a GBK string to be written out to db or file. 
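With the reworked FASTA builder, each feature that carries a `Description` becomes its own record, pulling its bases from the parent sequence. A rough sketch with made-up coordinates and a scratch output path:

```go
package poly

func ExampleWriteFASTA() {
	var sequence Sequence
	sequence.Sequence = "ATGGTGAGCAAGGGCGAGGAGCTGTTCACC" // made-up bases

	sequence.AddFeature(Feature{
		Description:      "example_gene partial CDS", // becomes the ">" header line
		SequenceLocation: Location{Start: 0, End: 30},
	})

	// features without a Description are skipped by BuildFASTA.
	WriteFASTA(sequence, "data/example.fasta")
}
```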
-func BuildGbk(annotatedSequence AnnotatedSequence) []byte { +func BuildGbk(sequence Sequence) []byte { var gbkString bytes.Buffer - locus := annotatedSequence.Meta.Locus + locus := sequence.Meta.Locus var shape string if locus.Circular { @@ -586,33 +604,36 @@ func BuildGbk(annotatedSequence AnnotatedSequence) []byte { } else if locus.Linear { shape = "linear" } + + fivespace := generateWhiteSpace(subMetaIndex) + // building locus - locusData := locus.Name + FIVESPACE + locus.SequenceLength + " bp" + FIVESPACE + locus.MoleculeType + FIVESPACE + shape + FIVESPACE + locus.GenbankDivision + FIVESPACE + locus.ModificationDate + locusData := locus.Name + fivespace + locus.SequenceLength + " bp" + fivespace + locus.MoleculeType + fivespace + shape + fivespace + locus.GenbankDivision + fivespace + locus.ModificationDate locusString := "LOCUS " + locusData + "\n" gbkString.WriteString(locusString) // building other standard meta features - definitionString := buildMetaString("DEFINITION", annotatedSequence.Meta.Definition) + definitionString := buildMetaString("DEFINITION", sequence.Meta.Definition) gbkString.WriteString(definitionString) - accessionString := buildMetaString("ACCESSION", annotatedSequence.Meta.Accession) + accessionString := buildMetaString("ACCESSION", sequence.Meta.Accession) gbkString.WriteString(accessionString) - versionString := buildMetaString("VERSION", annotatedSequence.Meta.Version) + versionString := buildMetaString("VERSION", sequence.Meta.Version) gbkString.WriteString(versionString) - keywordsString := buildMetaString("KEYWORDS", annotatedSequence.Meta.Keywords) + keywordsString := buildMetaString("KEYWORDS", sequence.Meta.Keywords) gbkString.WriteString(keywordsString) - sourceString := buildMetaString("SOURCE", annotatedSequence.Meta.Source) + sourceString := buildMetaString("SOURCE", sequence.Meta.Source) gbkString.WriteString(sourceString) - organismString := buildMetaString(" ORGANISM", annotatedSequence.Meta.Organism) + organismString := buildMetaString(" ORGANISM", sequence.Meta.Organism) gbkString.WriteString(organismString) // building references // TODO: could use reflection to get keys and make more general. - for referenceIndex, reference := range annotatedSequence.Meta.References { + for referenceIndex, reference := range sequence.Meta.References { referenceData := strconv.Itoa(referenceIndex+1) + " " + reference.Range referenceString := buildMetaString("REFERENCE", referenceData) gbkString.WriteString(referenceString) @@ -640,19 +661,19 @@ func BuildGbk(annotatedSequence AnnotatedSequence) []byte { } // building other meta fields that are catch all - otherKeys := make([]string, 0, len(annotatedSequence.Meta.Other)) - for key := range annotatedSequence.Meta.Other { + otherKeys := make([]string, 0, len(sequence.Meta.Other)) + for key := range sequence.Meta.Other { otherKeys = append(otherKeys, key) } for _, otherKey := range otherKeys { - otherString := buildMetaString(otherKey, annotatedSequence.Meta.Other[otherKey]) + otherString := buildMetaString(otherKey, sequence.Meta.Other[otherKey]) gbkString.WriteString(otherString) } // start writing features section. gbkString.WriteString("FEATURES Location/Qualifiers\n") - for _, feature := range annotatedSequence.Features { + for _, feature := range sequence.Features { gbkString.WriteString(buildGbkFeatureString(feature)) } @@ -660,7 +681,7 @@ func BuildGbk(annotatedSequence AnnotatedSequence) []byte { gbkString.WriteString("ORIGIN\n") // iterate over every character in sequence range. 
- for index, base := range annotatedSequence.Sequence.Sequence { + for index, base := range sequence.Sequence { // if 60th character add newline then whitespace and index number and space before adding next base. if index%60 == 0 { if index != 0 { @@ -689,22 +710,22 @@ func BuildGbk(annotatedSequence AnnotatedSequence) []byte { } // ReadGbk reads a Gbk from path and parses into an Annotated sequence struct. -func ReadGbk(path string) AnnotatedSequence { +func ReadGbk(path string) Sequence { file, err := ioutil.ReadFile(path) - var annotatedSequence AnnotatedSequence + var sequence Sequence if err != nil { // return 0, fmt.Errorf("Failed to open file %s for unpack: %s", gzFilePath, err) } else { gbkString := string(file) - annotatedSequence = ParseGbk(gbkString) + sequence = ParseGbk(gbkString) } - return annotatedSequence + return sequence } -// WriteGbk takes an AnnotatedSequence struct and a path string and writes out a gff to that path. -func WriteGbk(annotatedSequence AnnotatedSequence, path string) { - gbk := BuildGbk(annotatedSequence) +// WriteGbk takes an Sequence struct and a path string and writes out a gff to that path. +func WriteGbk(sequence Sequence, path string) { + gbk := BuildGbk(sequence) _ = ioutil.WriteFile(path, gbk, 0644) } @@ -991,63 +1012,6 @@ func topLevelFeatureCheck(featureString string) bool { return flag } -// checks for only sub level features in genbankSubLevelFeatures array -func subLevelFeatureCheck(featureString string) bool { - flag := false - cleanedFeatureString := strings.TrimSpace(featureString) - for _, feature := range genbankSubLevelFeatures { - if feature == cleanedFeatureString { - flag = true - break - } - } - return flag -} - -// checks for both sub and top level features in genbankSubLevelFeatures and genbankTopLevelFeatures array -func allLevelFeatureCheck(featureString string) bool { - flag := false - cleanedFeatureString := strings.TrimSpace(featureString) - if subLevelFeatureCheck(cleanedFeatureString) || topLevelFeatureCheck(cleanedFeatureString) { - flag = true - } - return flag -} - -// will eventually refactor all checks into one function. -func geneFeatureTypeCheck(featureString string) bool { - flag := false - cleanedFeatureString := strings.TrimSpace(featureString) - for _, feature := range genbankGeneFeatureTypes { - if feature == cleanedFeatureString { - flag = true - break - } - } - return flag -} - -func geneQualifierTypeCheck(featureString string) bool { - flag := false - cleanedFeatureString := strings.TrimSpace(strings.SplitAfter(featureString, "=")[0]) - for _, feature := range genbankGeneQualifierTypes { - if feature == cleanedFeatureString { - flag = true - break - } - } - return flag -} - -func allGeneTypeCheck(featureString string) bool { - flag := false - cleanedFeatureString := strings.TrimSpace(featureString) - if geneQualifierTypeCheck(cleanedFeatureString) || topLevelFeatureCheck(cleanedFeatureString) { - flag = true - } - return flag -} - // parses locus from provided string. func parseLocus(locusString string) Locus { locus := Locus{} @@ -1269,9 +1233,8 @@ func getFeatures(lines []string) []Feature { return features } -// takes every line after origin feature and removes anything that isn't in the alphabet. Returns sequence. -func getSequence(subLines []string) Sequence { - sequence := Sequence{} +// takes every line after origin feature and removes anything that isn't in the alphabet. Returns sequence string. 
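The GenBank writer mirrors the reader the same way, which is exactly what the regression tests lean on. A minimal sketch, reusing the pUC19 test file and a throwaway output path:

```go
package poly

import (
	"fmt"
	"os"
)

func ExampleWriteGbk() {
	sequence := ReadGbk("data/puc19.gbk")

	WriteGbk(sequence, "data/puc19_copy.gbk") // build and write a GenBank flat file
	roundTrip := ReadGbk("data/puc19_copy.gbk")
	os.Remove("data/puc19_copy.gbk")

	// the locus name and raw sequence should survive the round trip.
	fmt.Println(roundTrip.Meta.Locus.Name, len(roundTrip.Sequence))
}
```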
+func getSequence(subLines []string) string { var sequenceBuffer bytes.Buffer reg, err := regexp.Compile("[^a-zA-Z]+") if err != nil { @@ -1280,7 +1243,7 @@ func getSequence(subLines []string) Sequence { for _, subLine := range subLines { sequenceBuffer.WriteString(subLine) } - sequence.Sequence = reg.ReplaceAllString(sequenceBuffer.String(), "") + sequence := reg.ReplaceAllString(sequenceBuffer.String(), "") return sequence } @@ -1360,7 +1323,7 @@ func buildMetaString(name string, data string) string { if index == 0 { returnData = name + datum + "\n" } else { - returnData += TENSPACE + " " + datum + "\n" + returnData += generateWhiteSpace(11) + datum + "\n" } } @@ -1398,11 +1361,8 @@ func buildGbkLocationString(location Location) string { // buildGbkFeatureString is a helper function to build gbk feature strings for BuildGbk() func buildGbkFeatureString(feature Feature) string { - whitespaceTrailLength := 16 - len(feature.Type) // I wish I was kidding. - var whitespaceTrail string - for i := 0; i < whitespaceTrailLength; i++ { - whitespaceTrail += " " - } + whiteSpaceTrailLength := 16 - len(feature.Type) // I wish I was kidding. + whiteSpaceTrail := generateWhiteSpace(whiteSpaceTrailLength) var location string if feature.GbkLocationString != "" { @@ -1410,7 +1370,7 @@ func buildGbkFeatureString(feature Feature) string { } else { location = buildGbkLocationString(feature.SequenceLocation) } - featureHeader := FIVESPACE + feature.Type + whitespaceTrail + location + "\n" + featureHeader := generateWhiteSpace(subMetaIndex) + feature.Type + whiteSpaceTrail + location + "\n" returnString := featureHeader qualifierKeys := make([]string, 0, len(feature.Attributes)) @@ -1419,12 +1379,22 @@ func buildGbkFeatureString(feature Feature) string { } for _, qualifier := range qualifierKeys { - returnString += " " + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n" + returnString += generateWhiteSpace(qualifierIndex) + "/" + qualifier + "=\"" + feature.Attributes[qualifier] + "\"\n" } return returnString } +func generateWhiteSpace(length int) string { + var spaceBuilder strings.Builder + + for i := 0; i < length; i++ { + spaceBuilder.WriteString(" ") + } + + return spaceBuilder.String() +} + /****************************************************************************** GBK specific IO related things end here. diff --git a/io_test.go b/io_test.go index a8e642c7..a5df1537 100644 --- a/io_test.go +++ b/io_test.go @@ -27,7 +27,6 @@ Gff related tests and benchmarks begin here. ******************************************************************************/ -// TODO should delete output files. func TestGffIO(t *testing.T) { testInputPath := "data/ecoli-mg1655.gff" testOutputPath := "data/test.gff" @@ -37,7 +36,7 @@ func TestGffIO(t *testing.T) { readTestSequence := ReadGff(testOutputPath) - if diff := cmp.Diff(testSequence, readTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(testSequence, readTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { t.Errorf("Parsing the output of BuildGff() does not produce the same output as parsing the original file read with ReadGff(). 
Got this diff:\n%s", diff) } @@ -91,7 +90,7 @@ func TestGbkIO(t *testing.T) { WriteGbk(gbk, "data/puc19gbktest.gbk") writeTestGbk := ReadGbk("data/puc19gbktest.gbk") os.Remove("data/puc19gbktest.gbk") - if diff := cmp.Diff(gbk, writeTestGbk, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(gbk, writeTestGbk, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { t.Errorf("Parsing the output of BuildGbk() does not produce the same output as parsing the original file read with ReadGbk(). Got this diff:\n%s", diff) } } @@ -111,7 +110,7 @@ func TestGbkLocationStringBuilder(t *testing.T) { os.Remove("data/sample_test.gbk") - if diff := cmp.Diff(testInputGbk, testOutputGbk, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(testInputGbk, testOutputGbk, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { t.Errorf("Issue with partial location building. Parsing the output of BuildGbk() does not produce the same output as parsing the original file read with ReadGbk(). Got this diff:\n%s", diff) } @@ -129,7 +128,7 @@ func TestGbkLocationStringBuilder(t *testing.T) { os.Remove("data/t4_intron_test.gbk") - if diff := cmp.Diff(testInputGbk, testOutputGbk, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(testInputGbk, testOutputGbk, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { t.Errorf("Issue with either Join or complement location building. Parsing the output of BuildGbk() does not produce the same output as parsing the original file read with ReadGbk(). Got this diff:\n%s", diff) } @@ -151,7 +150,7 @@ func TestLocusParseRegression(t *testing.T) { gbk := ReadGbk("data/puc19.gbk").Meta.Locus json := ReadJSON("data/puc19static.json").Meta.Locus - if diff := cmp.Diff(gbk, json, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(gbk, json, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { t.Errorf("The meta parser has changed behaviour. 
Got this diff:\n%s", diff) } } @@ -159,7 +158,7 @@ func TestLocusParseRegression(t *testing.T) { func TestSnapgeneGenbankRegression(t *testing.T) { snapgene := ReadGbk("data/puc19_snapgene.gb") - if snapgene.Sequence.Sequence == "" { + if snapgene.Sequence == "" { t.Errorf("Parsing snapgene returned an empty string") } } @@ -209,7 +208,7 @@ func TestJSONIO(t *testing.T) { // cleaning up test data os.Remove("data/test.json") - if diff := cmp.Diff(testSequence, readTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(testSequence, readTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { t.Errorf(" mismatch (-want +got):\n%s", diff) } @@ -220,7 +219,7 @@ func TestJSONIO(t *testing.T) { // cleaning up test data os.Remove("data/test.json") - if diff := cmp.Diff(gffTestSequence, gffReadTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(gffTestSequence, gffReadTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { // t.Errorf(" mismatch (-want +got):\n%s", diff) } @@ -254,10 +253,8 @@ func TestFASTAIO(t *testing.T) { // cleanup os.Remove(testOutputFilename) - for index := range testSequence { - if diff := cmp.Diff(testSequence[index], readTestSequence[index], cmpopts.IgnoreFields(Feature{}, "ParentAnnotatedSequence")); diff != "" { - t.Errorf(" mismatch (-want +got):\n%s", diff) - } + if diff := cmp.Diff(testSequence, readTestSequence, cmpopts.IgnoreFields(Feature{}, "ParentSequence")); diff != "" { + t.Errorf(" mismatch (-want +got):\n%s", diff) } } diff --git a/poly/commands.go b/poly/commands.go index bbbe43e0..0e6d18b8 100644 --- a/poly/commands.go +++ b/poly/commands.go @@ -78,17 +78,17 @@ parse them, and then spit out a similiarly named file with the .json extension. func convertCommand(c *cli.Context) error { if isPipe(c) { - annotatedSequence := parseStdin(c) + sequence := parseStdin(c) var output []byte // logic for chosing output format, then builds string to be output. if c.String("o") == "json" { - output, _ = json.MarshalIndent(annotatedSequence, "", " ") + output, _ = json.MarshalIndent(sequence, "", " ") } else if c.String("o") == "gff" { - output = poly.BuildGff(annotatedSequence) + output = poly.BuildGff(sequence) } else if c.String("o") == "gbk" || c.String("o") == "gb" { - output = poly.BuildGbk(annotatedSequence) + output = poly.BuildGbk(sequence) } // output to stdout @@ -115,16 +115,16 @@ func convertCommand(c *cli.Context) error { // executing Go routine. go func(match string) { extension := filepath.Ext(match) - annotatedSequence := fileParser(c, match) + sequence := fileParser(c, match) // determining output format and name, then writing out to name. outputPath := match[0 : len(match)-len(extension)] if c.String("o") == "json" { - poly.WriteJSON(annotatedSequence, outputPath+".json") + poly.WriteJSON(sequence, outputPath+".json") } else if c.String("o") == "gff" { - poly.WriteGff(annotatedSequence, outputPath+".gff") + poly.WriteGff(sequence, outputPath+".gff") } else if c.String("o") == "gbk" || c.String("o") == "gb" { - poly.WriteGbk(annotatedSequence, outputPath+".gbk") + poly.WriteGbk(sequence, outputPath+".gbk") } // decrementing wait group. 
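This is the same parser/builder pairing the `convert` command wires together: parse whatever came over the pipe, build the requested output format, and print it. A stripped-down sketch of that flow as library code; the function name is hypothetical and `gbkString` is assumed to hold a GenBank record:

```go
package poly

import "fmt"

// convertSketch mirrors the piped path of the convert command: gbk in, gff out.
func convertSketch(gbkString string) {
	sequence := ParseGbk(gbkString) // what the CLI does for piped gbk input

	// choose an output builder the same way the -o flag does.
	fmt.Println(string(BuildGff(sequence)))
}
```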
@@ -173,8 +173,8 @@ parse them, and then spit out a similiarly named file with the .json extension a func hashCommand(c *cli.Context) error { if isPipe(c) { - annotatedSequence := parseStdin(c) // get sequence from stdin - sequenceHash := flagSwitchHash(c, annotatedSequence) // get hash include no-op which only rotates the sequence + sequence := parseStdin(c) // get sequence from stdin + sequenceHash := flagSwitchHash(c, sequence) // get hash include no-op which only rotates the sequence // handler for outputting String to stdout <- Default for pipes if c.String("o") == "string" { @@ -189,9 +189,9 @@ func hashCommand(c *cli.Context) error { // handler for outputting JSON to stdout if c.String("o") == "json" { - annotatedSequence.Sequence.SequenceHash = sequenceHash // adding hash to JSON - annotatedSequence.Sequence.SequenceHashFunction = strings.ToUpper(c.String("f")) // adding hash type to JSON - output, _ := json.MarshalIndent(annotatedSequence, "", " ") + sequence.SequenceHash = sequenceHash // adding hash to JSON + sequence.SequenceHashFunction = strings.ToUpper(c.String("f")) // adding hash type to JSON + output, _ := json.MarshalIndent(sequence, "", " ") fmt.Fprint(c.App.Writer, string(output)) } @@ -212,8 +212,8 @@ func hashCommand(c *cli.Context) error { // executing Go routine. go func(match string) { extension := filepath.Ext(match) - annotatedSequence := fileParser(c, match) - sequenceHash := flagSwitchHash(c, annotatedSequence) + sequence := fileParser(c, match) + sequenceHash := flagSwitchHash(c, sequence) // handler for outputting String <- Default if strings.ToLower(c.String("o")) == "string" { @@ -241,16 +241,16 @@ func hashCommand(c *cli.Context) error { // handler for outputting JSON. if strings.ToLower(c.String("o")) == "json" { - annotatedSequence.Sequence.SequenceHash = sequenceHash - annotatedSequence.Sequence.SequenceHashFunction = strings.ToUpper(c.String("f")) + sequence.SequenceHash = sequenceHash + sequence.SequenceHashFunction = strings.ToUpper(c.String("f")) if c.Bool("--log") == true { - output, _ := json.MarshalIndent(annotatedSequence, "", " ") + output, _ := json.MarshalIndent(sequence, "", " ") fmt.Fprint(c.App.Writer, string(output)) } else { outputPath := match[0 : len(match)-len(extension)] // should have way to support wildcard matches for varied output names. - poly.WriteJSON(annotatedSequence, outputPath+".json") + poly.WriteJSON(sequence, outputPath+".json") } } @@ -300,19 +300,19 @@ func optimizeCommand(c *cli.Context) error { // if a file exists to weigh the table. Weigh it. 
if fileExists(c.String("wt")) { targetOrganism := fileParser(c, c.String("wt")) - codonTable.CreateWeights(targetOrganism.Sequence.Sequence) + codonTable.CreateWeights(targetOrganism.Sequence) } if isPipe(c) { - // uncomment below to parse annotatedSequence from pipe - annotatedSequence := parseStdin(c) + // uncomment below to parse sequence from pipe + sequence := parseStdin(c) var aminoAcids string if c.Bool("aa") { - aminoAcids = annotatedSequence.Sequence.Sequence + aminoAcids = sequence.Sequence } else { - aminoAcids = poly.Translate(annotatedSequence.Sequence.Sequence, codonTable) + aminoAcids = poly.Translate(sequence.Sequence, codonTable) } optimizedSequence := poly.Optimize(aminoAcids, codonTable) @@ -352,10 +352,10 @@ func translateCommand(c *cli.Context) error { codonTable = poly.DefaultCodonTablesByName[c.String("ct")] } - // uncomment below to parse annotatedSequence from pipe - annotatedSequence := parseStdin(c) + // uncomment below to parse sequence from pipe + sequence := parseStdin(c) - aminoAcids := poly.Translate(annotatedSequence.Sequence.Sequence, codonTable) + aminoAcids := poly.Translate(sequence.Sequence, codonTable) fmt.Fprintln(c.App.Writer, aminoAcids) @@ -416,67 +416,66 @@ func isNumeric(s string) bool { } // a simple helper function to take stdin from a pipe and parse it into an annotated sequence -func parseStdin(c *cli.Context) poly.AnnotatedSequence { - var annotatedSequence poly.AnnotatedSequence +func parseStdin(c *cli.Context) poly.Sequence { + var sequence poly.Sequence // logic for determining input format, then parses accordingly. if c.String("i") == "json" { - json.Unmarshal([]byte(stdinToString(c.App.Reader)), &annotatedSequence) + json.Unmarshal([]byte(stdinToString(c.App.Reader)), &sequence) } else if c.String("i") == "gbk" || c.String("i") == "gb" { - annotatedSequence = poly.ParseGbk(stdinToString(c.App.Reader)) + sequence = poly.ParseGbk(stdinToString(c.App.Reader)) } else if c.String("i") == "gff" { - annotatedSequence = poly.ParseGff(stdinToString(c.App.Reader)) + sequence = poly.ParseGff(stdinToString(c.App.Reader)) } else if c.String("i") == "string" { - annotatedSequence.Sequence.Sequence = stdinToString(c.App.Reader) + sequence.Sequence = stdinToString(c.App.Reader) } - return annotatedSequence + return sequence } // helper function to hash sequence based on flag using generic hash. 
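`Translate` takes a coding sequence and a codon table, which is all the `translate` subcommand really does after parsing stdin. A small sketch using the standard bacterial table (11), the same one the tests use; the DNA fragment is made up for illustration:

```go
package poly

import "fmt"

func ExampleTranslate() {
	codonTable := DefaultCodonTablesByNumber[11] // standard bacterial/archaeal table

	dnaSequence := "ATGGCTAGCAAAGGAGAAGAA" // made-up start of a CDS
	protein := Translate(dnaSequence, codonTable)

	fmt.Println(protein) // "MASKGEE" for this fragment
}
```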
-func flagSwitchHash(c *cli.Context, annotatedSequence poly.AnnotatedSequence) string { +func flagSwitchHash(c *cli.Context, sequence poly.Sequence) string { var hashString string switch strings.ToUpper(c.String("f")) { case "MD5": - hashString = annotatedSequence.Hash(crypto.MD5.New()) + hashString = sequence.Hash(crypto.MD5.New()) case "SHA1": - hashString = annotatedSequence.Hash(crypto.SHA1.New()) + hashString = sequence.Hash(crypto.SHA1.New()) case "SHA244": - hashString = annotatedSequence.Hash(crypto.SHA224.New()) + hashString = sequence.Hash(crypto.SHA224.New()) case "SHA256": - hashString = annotatedSequence.Hash(crypto.SHA256.New()) + hashString = sequence.Hash(crypto.SHA256.New()) case "SHA384": - hashString = annotatedSequence.Hash(crypto.SHA384.New()) + hashString = sequence.Hash(crypto.SHA384.New()) case "SHA512": - hashString = annotatedSequence.Hash(crypto.SHA512.New()) + hashString = sequence.Hash(crypto.SHA512.New()) case "RIPEMD160": - hashString = annotatedSequence.Hash(crypto.RIPEMD160.New()) + hashString = sequence.Hash(crypto.RIPEMD160.New()) case "SHA3_224": - hashString = annotatedSequence.Hash(crypto.SHA3_224.New()) + hashString = sequence.Hash(crypto.SHA3_224.New()) case "SHA3_256": - hashString = annotatedSequence.Hash(crypto.SHA3_256.New()) + hashString = sequence.Hash(crypto.SHA3_256.New()) case "SHA3_384": - hashString = annotatedSequence.Hash(crypto.SHA3_384.New()) + hashString = sequence.Hash(crypto.SHA3_384.New()) case "SHA3_512": - hashString = annotatedSequence.Hash(crypto.SHA3_512.New()) + hashString = sequence.Hash(crypto.SHA3_512.New()) case "SHA512_224": - hashString = annotatedSequence.Hash(crypto.SHA512_224.New()) + hashString = sequence.Hash(crypto.SHA512_224.New()) case "SHA512_256": - hashString = annotatedSequence.Hash(crypto.SHA512_256.New()) + hashString = sequence.Hash(crypto.SHA512_256.New()) case "BLAKE2s_256": - hashString = annotatedSequence.Hash(crypto.BLAKE2s_256.New()) + hashString = sequence.Hash(crypto.BLAKE2s_256.New()) case "BLAKE2b_256": - hashString = annotatedSequence.Hash(crypto.BLAKE2b_256.New()) + hashString = sequence.Hash(crypto.BLAKE2b_256.New()) case "BLAKE2b_384": - hashString = annotatedSequence.Hash(crypto.BLAKE2b_384.New()) + hashString = sequence.Hash(crypto.BLAKE2b_384.New()) case "BLAKE2b_512": - hashString = annotatedSequence.Hash(crypto.BLAKE2b_512.New()) + hashString = sequence.Hash(crypto.BLAKE2b_512.New()) case "BLAKE3": - hashString = annotatedSequence.Hash(blake3.New(32, nil)) - // hashString = annotatedSequence.Blake3Hash() + hashString = sequence.Hash(blake3.New(32, nil)) case "NO": - hashString = poly.RotateSequence(annotatedSequence.Sequence.Sequence) + hashString = poly.RotateSequence(sequence.Sequence) default: - hashString = annotatedSequence.Hash(blake3.New(32, nil)) + hashString = sequence.Hash(blake3.New(32, nil)) break } return hashString @@ -500,21 +499,21 @@ func getMatches(c *cli.Context) []string { } // function to parse whatever file is at a matched path. -func fileParser(c *cli.Context, match string) poly.AnnotatedSequence { +func fileParser(c *cli.Context, match string) poly.Sequence { extension := filepath.Ext(match) - var annotatedSequence poly.AnnotatedSequence + var sequence poly.Sequence - // determining which reader to use and parse into AnnotatedSequence struct. + // determining which reader to use and parse into Sequence struct. 
if extension == ".gff" || c.String("i") == "gff" { - annotatedSequence = poly.ReadGff(match) + sequence = poly.ReadGff(match) } else if extension == ".gbk" || extension == ".gb" || c.String("i") == "gbk" || c.String("i") == "gb" { - annotatedSequence = poly.ReadGbk(match) + sequence = poly.ReadGbk(match) } else if extension == ".json" || c.String("i") == "json" { - annotatedSequence = poly.ReadJSON(match) + sequence = poly.ReadJSON(match) } else { // TODO put default error handling here. } - return annotatedSequence + return sequence } // fileExists checks if a file exists and is not a directory before we diff --git a/poly/commands_test.go b/poly/commands_test.go index 77438f92..c501ea3e 100644 --- a/poly/commands_test.go +++ b/poly/commands_test.go @@ -57,7 +57,7 @@ func TestConvertPipe(t *testing.T) { pipeOutputTestSequence := poly.ParseJSON(writeBuffer.Bytes()) - if diff := cmp.Diff(baseTestSequence, pipeOutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(baseTestSequence, pipeOutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" { t.Errorf(" mismatch from convert pipe input test (-want +got):\n%s", diff) } @@ -83,7 +83,7 @@ func TestConvertFile(t *testing.T) { os.Remove("../data/puc19.json") // compared input gff from resulting output json. Fail test and print diff if error. - if diff := cmp.Diff(puc19InputTestSequence, puc19OutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(puc19InputTestSequence, puc19OutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" { t.Errorf(" mismatch from concurrent gbk input test (-want +got):\n%s", diff) } @@ -94,7 +94,7 @@ func TestConvertFile(t *testing.T) { os.Remove("../data/t4_intron.json") // compared input gbk from resulting output json. Fail test and print diff if error. - if diff := cmp.Diff(t4InputTestSequence, t4OutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentAnnotatedSequence")); diff != "" { + if diff := cmp.Diff(t4InputTestSequence, t4OutputTestSequence, cmpopts.IgnoreFields(poly.Feature{}, "ParentSequence")); diff != "" { t.Errorf(" mismatch from concurrent gbk input test (-want +got):\n%s", diff) } } @@ -170,7 +170,7 @@ func TestHashJSON(t *testing.T) { t.Fatalf("Run error: %s", err) } - hashOutputString := poly.ReadJSON("../data/puc19.json").Sequence.SequenceHash + hashOutputString := poly.ReadJSON("../data/puc19.json").SequenceHash os.Remove("../data/puc19.json") if hashOutputString != puc19GbkBlake3Hash { diff --git a/sequence.go b/sequence.go index 0c26b25b..efe1cdfc 100644 --- a/sequence.go +++ b/sequence.go @@ -42,21 +42,15 @@ var ComplementBaseRuneMap = map[rune]rune{ } // GetSequence is a method to get the full sequence of an annotated sequence -func (annotatedSequence AnnotatedSequence) GetSequence() string { - return annotatedSequence.Sequence.Sequence +func (sequence Sequence) GetSequence() string { + return sequence.Sequence } -// GetSequence is a method wrapper to get a Feature's sequence. Mutates with AnnotatedSequence. +// GetSequence is a method wrapper to get a Feature's sequence. Mutates with Sequence. func (feature Feature) GetSequence() string { return getFeatureSequence(feature, feature.SequenceLocation) } -// GetSequence is a method to get the full sequence of an unannotated sequence -// you could also just use sequence.sequence but I thought it was funny to include this. 
-func (sequence Sequence) GetSequence() string { - return sequence.Sequence -} - // ReverseComplement takes the reverse complement of a sequence func ReverseComplement(sequence string) string { complementString := strings.Map(ComplementBase, sequence) @@ -78,7 +72,7 @@ func ComplementBase(basePair rune) rune { func getFeatureSequence(feature Feature, location Location) string { var sequenceBuffer bytes.Buffer var sequenceString string - parentSequence := feature.ParentAnnotatedSequence.Sequence.Sequence + parentSequence := feature.ParentSequence.Sequence if len(location.SubLocations) == 0 { sequenceBuffer.WriteString(parentSequence[location.Start:location.End]) diff --git a/sequence_test.go b/sequence_test.go index 9458b090..cc7ffdd4 100644 --- a/sequence_test.go +++ b/sequence_test.go @@ -9,8 +9,8 @@ func TestGetSequenceMethods(t *testing.T) { gbk := ReadGbk("data/t4_intron.gb") // Check to see if GetSequence method works on Annotated struct - if gbk.GetSequence() != gbk.Sequence.Sequence { - t.Errorf(" AnnotatedSequence GetSequence method has failed'. Got this:\n%s instead of \n%s", gbk.GetSequence(), gbk.Sequence.Sequence) + if gbk.GetSequence() != gbk.Sequence { + t.Errorf(" Sequence GetSequence method has failed'. Got this:\n%s instead of \n%s", gbk.GetSequence(), gbk.Sequence) } // Check to see if GetSequence method works on Features struct @@ -21,8 +21,8 @@ func TestGetSequenceMethods(t *testing.T) { } // Check to see if GetSequence method works on Sequence struct - if gbk.Sequence.GetSequence() != gbk.Sequence.Sequence { - t.Errorf("Sequence GetSequence method has failed.. Got this:\n%s instead of \n%s", gbk.Sequence.GetSequence(), gbk.Sequence.Sequence) + if gbk.GetSequence() != gbk.Sequence { + t.Errorf("Sequence GetSequence method has failed.. Got this:\n%s instead of \n%s", gbk.GetSequence(), gbk.Sequence) } } diff --git a/transformations.go b/transformations.go index 512d673f..5160e8a7 100644 --- a/transformations.go +++ b/transformations.go @@ -209,12 +209,12 @@ func (codonTable CodonTable) generateTranslationTable() map[string]string { return translationMap } -// helper function to pull coding regions out of an AnnotatedSequence -func getCodingRegions(annotatedSequence AnnotatedSequence) string { - // pick out the each coding region in the AnnotatedSequence and add it to the sequence Builder +// helper function to pull coding regions out of an Sequence +func getCodingRegions(sequence Sequence) string { + // pick out the each coding region in the Sequence and add it to the sequence Builder var sequenceBuilder strings.Builder - for _, feature := range annotatedSequence.Features { + for _, feature := range sequence.Features { if feature.Type == "CDS" { sequenceBuilder.WriteString(feature.GetSequence()) } diff --git a/transformations_test.go b/transformations_test.go index 52c57635..e0362f4d 100644 --- a/transformations_test.go +++ b/transformations_test.go @@ -18,8 +18,8 @@ func TestOptimize(t *testing.T) { gfpTranslation := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFSYGVQCFSRYPDHMKRHDFFKSAMPEGYVQERTISFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYITADKQKNGIKANFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*" codonTable := DefaultCodonTablesByNumber[11] - annotatedSequence := ReadGbk("data/bsub.gbk") - rawSequence := annotatedSequence.Sequence.Sequence + sequence := ReadGbk("data/bsub.gbk") + rawSequence := sequence.Sequence codonTable.CreateWeights(rawSequence) optimizedSequence := Optimize(gfpTranslation, codonTable)
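For completeness, here is roughly how the codon-optimization pieces exercised by `TestOptimize` fit together outside of a test; it assumes the same `bsub.gbk` host data and a protein string you want to express (the GFP fragment below is truncated for brevity):

```go
package poly

import "fmt"

func ExampleOptimize() {
	codonTable := DefaultCodonTablesByNumber[11]

	// weight the table with codon usage from the target host's sequence.
	hostSequence := ReadGbk("data/bsub.gbk")
	codonTable.CreateWeights(hostSequence.Sequence)

	gfp := "MASKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGK" // truncated protein
	optimized := Optimize(gfp, codonTable)

	// translating back should reproduce the input protein.
	fmt.Println(Translate(optimized, codonTable) == gfp)
}
```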