BioPython AlignIO ValueError says strings must be same length? - python

Input fasta-format text file:
http://www.jcvi.org/cgi-bin/tigrfams/DownloadFile.cgi?file=/opt/www/www_tmp/tigrfams/fa_alignment_PF00205.txt
#!/usr/bin/python
from Bio import AlignIO
seq_file = open('/path/to/fa_alignment_PF00205.txt')
alignment = AlignIO.read(seq_file, "fasta")
Error:
ValueError: Sequences must all be the same length
The input sequences shouldn't have to be the same length since on ClustalOmega you can align sequences of differing lengths.
This also doesn't work...gets the same error:
alignment = AlignIO.parse(seq_file,"fasta")
for record in alignment:
print(record.id)
Does anybody who is familiar with BioPython know how to get around this to align sequences from fasta files?

Pad the sequences that are too short and write the records to a temporary FASTA file. Then your alignment works as expected:
from Bio import AlignIO
from Bio import SeqIO
from Bio import Seq
import os
input_file = '/path/to/fa_alignment_PF00205.txt'

# SeqIO.parse returns a one-shot iterator; the records are walked more than
# once below (to find maxlen, to pad, to write), so materialise them first.
records = list(SeqIO.parse(input_file, 'fasta'))
if not records:
    # max() would raise ValueError on an empty list anyway; say why.
    raise ValueError('No FASTA records found in {}'.format(input_file))
maxlen = max(len(record.seq) for record in records)

# Right-pad every short sequence with '.' so that all rows have the same
# number of columns, which is what AlignIO requires of an alignment file.
for record in records:
    if len(record.seq) != maxlen:
        record.seq = Seq.Seq(str(record.seq).ljust(maxlen, '.'))
assert all(len(record.seq) == maxlen for record in records)

# Write the padded records next to the input file and read them back as an
# alignment.
output_file = '{}_padded.fasta'.format(os.path.splitext(input_file)[0])
with open(output_file, 'w') as f:
    SeqIO.write(records, f, 'fasta')
alignment = AlignIO.read(output_file, "fasta")
print(alignment)
This outputs:
SingleLetterAlphabet() alignment with 104 rows and 275 columns
TKAAIELIADHQ.......LTVLADLLVHRLQ..AVKELEALLA...QAL SP|A2VGF0.1/208-339
LQELASVINQHE...KV..MLFCGHGCR...Y..AVEEVMALAK...EDL SP|A3D4X6.1/190-319
IKKIAQAIEKAK...KP..VICAGGGVINS.N..ASEELLTLSR...KEL SP|A3DID9.1/192-327
IDEAAEAINKAE...RP..VILAGGGVSIA.G..ANKELFEFAT...QLL SP|A3DIY4.1/192-327
IEKAIELINSSQ...RP..FICSGGGVISS.E..ASEELIQFAE...KIL SP|A4XHS0.1/191-326
IKRAVEAIENSQ...RP..VICSGGGVIAS.R..ASDELKILVE...SEI SP|A4XIL5.1/194-328
VRQAARIIMESE...RP..VIYAGGGVRIS.G..AAPELLELSE...RAL SP|A5D4V9.1/192-327
LQALAQRILRAQ...RP..VIITGDEIVKS.D..ALQAAADFAS...LQL SP|A5ECG1.1/192-328
VEKAVELLWSAR...RV..LVISGRGAR...G..AGPELIGLLD...RAM SP|A5EDH4.1/198-324
IQKAARLIETAE...KP..VIIAGHGVNIS.G..ANEELKTLAE...KSL SP|A5FR34.1/193-328
LDALARDLDSAA...RV..TIYAGIGAR...G..AAARVVQLAG...EAL SP|A5FTR0.1/189-317
VADVAALLRAAR...RP..VIVAGGGVIHSG...AEERLATFAA...DAL SP|A5G0X6.1/217-351
IAEAVSALKGAK...RP..IIYTGGGLINS.GPESAELIVQLAK...RAL SP|A5G2E1.1/199-336
LKKAAEIINRAK...RP..LIYAGGGITLA.G..ASAELRALAA...ALL SP|A5GC69.1/192-327
CRDIVGKLLQSH...RP..VVLGGTGVRLS.R..TEQRLLALVE...DVF SP|A5W0I1.1/200-336
LDQAALKLAAAE...RP..MIIAGGGA..L.H..AAEQLAQLSA...AGL SP|A5W220.1/196-326
LQRAADILNTGH...KV..AILVGAGAL...Q..ATEQVIAIAE...RAL SP|A5W364.1/198-328
IRKAAEMLLAAK...RP..VVYSGGGVILG.G..GSEALTEIAK...SEM SP|A5W954.1/196-331
...
LTELQERLANAQ...RP..VVILGGSRWSD.A..AVQQFTRFAE...... SP|Q220C3.1/190-328

Your problem is the last record of the FASTA file. Inspect it with: tail -9 fa_alignment_PF00205.txt
>SP|Q21VK8.1/229-357
LQAALAALAKAE...RP..LLVIGSQALVLSK..QAEHLAEAVARL.GIPV.YLSGMA..RGLLG.R..........DH.
...............PLQ..................MRHQRRQALRE..ADCVLLAG.VP...CDFRLD......YGKHV
RR..............S.AT.........L..IAA.N......................RSA.........KDARLNR..
.......K...PD.IAAIGDAG.......LFLQAL
>SP|Q220C3.1/190-328
LTELQERLANAQ...RP..VVILGGSRWSD.A..AVQQFTRFAEAF.SLPV.FCSFRR..QMLFS.A..........NH.
...............ACY...AG.DLGLG.A.....NQRLLARI.RQ..SDLILLLG.GR...MSEVPS......QGYEL
LGIPAPQQ...........D
Sequence with id SP|Q220C3.1/190-328 has different length than other sequences

Related

BioPython AlignIO sequences must be the same length [multiple files]

I got an issue when I try to align multiple files
Here is my script:
from Bio import AlignIO
from Bio.Align import MultipleSeqAlignment
from Bio.Align.Applications import ClustalOmegaCommandline
# Pair up the u-th records of two DNA FASTA files and two protein FASTA files
# and print a MultipleSeqAlignment built from each protein pair.
#
# Parameters: fic1dna / fic2dna are the DNA FASTA files of the two species,
# fic1prot / fic2prot the matching protein FASTA files.
# Returns: nothing (there is no return statement, so the result is None).
#
# NOTE(review): IUPAC, SeqRecord and Seq are used below but are not imported
# anywhere in the visible snippet -- confirm they are imported elsewhere.
# NOTE(review): the reported ValueError("Sequences must all be the same
# length") presumably comes from the MultipleSeqAlignment(...) call, which is
# handed the two raw, unaligned protein sequences -- confirm against the
# Biopython documentation.
def divergence(fic1dna,fic2dna,fic1prot,fic2prot):
from Bio import SeqIO
# Load every record of each file into memory so they can be indexed by u.
seq1dna = list(SeqIO.parse(fic1dna, "fasta",alphabet=IUPAC.IUPACUnambiguousDNA()))
seq2dna = list(SeqIO.parse(fic2dna, "fasta",alphabet=IUPAC.IUPACUnambiguousDNA()))
seq1prot = list(SeqIO.parse(fic1prot, "fasta",alphabet=IUPAC.protein))
seq2prot= list(SeqIO.parse(fic2prot, "fasta",alphabet=IUPAC.protein))
u=0
while u < len(seq1dna): # pair the u-th record of each file; the loop bound is the record count of the first DNA file
# nuc1 / nuc2 are computed but not used in the visible code.
nuc1=str(seq1dna[u].seq)
nuc2=str(seq2dna[u].seq)
prot1=str(seq1prot[u].seq)
prot2=str(seq2prot[u].seq)
# Re-wrap the plain strings as SeqRecords with fixed ids 'pro1' / 'pro2'.
prot1 = SeqRecord(Seq(prot1, alphabet=IUPAC.protein),id='pro1')
prot2 = SeqRecord(Seq(prot2, alphabet=IUPAC.protein),id='pro2')
aln = MultipleSeqAlignment([prot1, prot2])
print(aln)
u+=1
print(divergence("concatenate_0035_fna_renamed.fst","concatenate_0042_fna_renamed.fst","concatenate_0035_faa_renamed.fst","concatenate_0042_faa_renamed.fst"))
So, as you can see, I have 4 files corresponding to 244 sequences from 2 species, and I need to calculate dN/dS for each pair, so I need to align each pair of sequences as a codon alignment.
But when I try to align my 244 protein sequences, the error ValueError("Sequences must all be the same length") is raised.
I do not know why the script does not accept sequences of different lengths, since all other programs do.
short input would be :
one file with the AA seq from the sp 1
>EOG090X005Q
CEHNTAGRDCEKCLDFYNDAPWGRASPTNVHECKACNCNGFSNKCYFDKDLYERTGHGGHCIDCEENRDGANCERCKENFYQGMEDICLPCNCNPTGSRSLQCNAEGKCQCKPGVTGDKCDVCAPNYFEFTMHGCKPCDCNVSGSYGNTPQCDPQTGVCLCKQNVEGRRCRECKPGFFNLDVENEFGCTPCFCFGHSSQCSSAPKYQAHEISAHYIRDAEKWGAEDDQRKPVQLQFNANTQNIAVASKGSEILYFLASGQFLGDQRPSYNHDLKFTLRLGESGGYPSSQDIILEGARSSVSMNIYAQNNPEPSDVAQEYSFRLHEDPRYGWTPTLSNFEFMSILQNLTAIKIRGTYNKGGVGYLINFKLETAKIGREKGSAPANWVEKCSCPKAYVGDYCEECAPGYKHEPANGGPYSTCIPCDCNGHAHICDTATGFCICKHNTTGSNCELCAKGFYGNAIAGTADDCKPCPCPKDSGCIQLMDQSIVCTDCPVGYAGPRCEVCADAHFGDPTGQFGAPQECEECQCNGNVDPNAVGNCNRTTGECLKCIYNTAGEHCDKCLSGYFGDALDQKKKGDCKPCQCLEAGTVESPEGARKAPLCDGLTGFCSCRPHVIGRNCDKCEVDLNCIAVLKT
>EOG090X00BV
MNAHFPQNEIARSEAYNIMSVRKQYLVPKDGTPLSGLIQDHVISGVKMSIRGAFFTKADYQQLVFQALSNHKGEIKLLPPTILKPIMLWSGKQILSTIIINSIPKGKPYLSLTGKAKISSKAWQKEPARTWNAGGTPFTNPNSMSEAEVIIRKGELLCGVLDKTHYGATPYGLVHCMYELYGGDSSSALLSSFSKVFTFYLQWIGFTLGVKDILVVEEADKQRDNFINLVRKVGKVAAAKATELPVDVDELKLKETISEMLIKDPKFRANLDRQYKSLLDSYTNNINTVCLSEGLLEKFPYNNLQLMVQSGAKGSTVNTMQISCLLGQIELEGKRPPLMISGRSLPSFPPYDISPRAGGFIDGRFMTGIQPQEFFFHCMAGREGLIDTAVKTSRSGYLQRCLIKHLEGLSVAYDHTVRDSDSSVIQFAYGEDGLDVIKCQYFNKDQFEFLDVNSNAVISKSAIKKLKEDDKSKALAKSQKSLKKWKKKNGNPFEKVRYSPFTEFSAIAKNDIVLDDKPTDQTRDPNYWELEKMWRNLDADEKKQYARKRCPDPIPSKYSPEYKFGVINEQLNELTQNYLKNRKEHMYSDYTDKDKFTEIINAKYLASMAAPGEPVGLLAAQSIGEPSTQMTLNTFHFAGRGDMNVTLGIPRLREILMTASAKLKTPSMDIPFRSDLPDLNKKAERLRQKMNRVTVSDVLEKIDVHCEIVTNPNRQLKTVMRFSFLPHSQYKVQYTVKPAQIIKHMQNKFFSEMFSIIRKQAKTTCGVMWSTEKEKKRRAASDEDDEDGEGASPDVAEKAVNMDEDSSDEEGPNDDDDNTDVS
and the other file, for species 2:
>EOG090X005Q
MGGKIAAILLFAFFTSGSRSEPDFVDGQFNKINKNRVEVKCYDDFGAPQRCIPPFENAAFGVLMEATNTCGQDGRPTEFCRQTGVQRKPCEFCHPGDHPASFLTDRDNNDNATWWQSETMHEGIEYPNKVVLTLNLGKTYDITYVRVLFESPRPESWGIFRRRTEDSPWEPYQFYSATCRDTYGLPDRKDTVRGEDTRVLCTSEYSDISPLRRGTVAFSTLEGRPSAFQFDTNPALQSWVQATDLRLSLDRPNTFGDELFGDGQVLKSYYYAIADVAVGARCACNGHAGECINSPHTNGTTRRVCRCEHNTAGPDCNECLPFYNDAPWGRATTTDAHECKPCNCNGYSDRCYFDKDLYERSGHGGHCTDCRANRAGPNCERCRENFYQRLEDSYCVACNCNEIGSRSLQCNSEGKCQCKPGITGDKCDRCAANFFNFDSLGCTSCECSPKGSLDNEPNCDPVSGACVCKENVEGKRCRECRPGFFNLDLDNEFGCTPCFCYGHSSVCNLANGYSKLTIESMFGRGNEKWTASVAGNPIPLHYDAVTQTISVNAPDRDNVYFVAPERFLGDQRASYNQDLTFTLRIAENEPAPTARDVILEGGNGEQLTQPIFGQTNQLPNASPQVYKFRLNEHADYGWEPRVTSRAFMSVLSNLTAIKIRGTYTHQGRGFLDDVSLETAQRGAAGEPADWIEHCQCPHGYVGQFCESCAPGFHHDPPNGGPFSLCVPCNCNGHADICEAETGQCICHHNTAGSNCDLCSRGFYGYPLKGTPHDCKPCPCPDNGPCILLGNNPDPICSECPSGRTGARCETCSDGYFGNPDQGQACRLCDCNNNIDLNAVRNCNHETGECLKCVNNTAGFHCEDCLSGYWGDALSERKEDSCKLCQCYPPGTIELDDGSVAPCNQLTGHCACKPHVIGRNCDKCEDGYYQILSGDGCTACNCDPEGSYNRTCDATTGQCECRPGITGKRCDTCLPYQFGFGRDGCKHCDCDTIGSQELQCDASGQCPCLTNVEGRRCDRCKENKYNRQYGCIDCPPCYNLIQDSVNQHRRRLNELESTLRKINNSPTVMKDSDFEKELKNVENRVKSLLQVAKQGSGNENKTLVEQLDELRDQLNQIEKISQSVDATAEDARRTTNEGLTSIEEAERVLDQIYEQLTEAEDYLATDGARALAAAKKRADQVGQQNQQMTIIAQEARVLADLNTNEAKKIHVLAEQARNTSLEAYNLAKKAIAKYSNISDEIRGLENKLELLEDRFNEVKNLTAAAVAKSAAVDKEALQLLILDLRVPAVDTNELRILLETVSVDGSEIKEQAQLLLGQNEAWLNELANKARKSEELLERAQDQQAATADLLSEVDGANEKAKDALKRGNQTLVEAQETLKKLGEFDAEVQKERIKAQEALTVLEEIKDMVNEAIAKANETESVLKDAESNAIAAKDIAIQAQVSNNADEASANANLIRQEANKTKLDAVRLGNEADKLHLRVEITNSIAKKHEARVDKDVNATNEVNHQVGQARNSLNLAGQQVDKALAEVDEIIKELDVLPEIDDADLDRLEERLLAAEKEIEEANLEKRIRELTEAKNLQTQWVKNYEDEVSRLRLEVENIDDIRKALPSICYKRLRLEP
>EOG090X00BV
MFSIFTASDVRNLSVLKISTPLSFNILGHPLKGGLYDPALGPLNDRSDPCGTCGEGTIQCMGHFGHIELPVPVVNPLFHKVLTSLLKLSCLKCYTLQIPSYLKLLLNGKLRLMEEGFSNDIPGLEQEVGSAVAGMNRIAEGELEFISDIIEAYIEMTCNQRHHVQSGKSKESTSTRTLNMEWHHYIESVVKTCKASKLCINCRNPIPKMTILKNKILTNHVVNNEDTMMEDRVIHKLETSFMTPDQSKKHLRGLWQKEADILRIIIPCLGSVDLEFPTDVFFFEIIPVLPPITRPVNMLDNQLVEHPQSQVYKSIIQDCLVLRNIIQTIQDGDTTQLPEEGRAVFDEIRGDNAAEKLHHAWTTLQSNVDHLMDREMSKTTESANCHGLKQVIEKKEGIIRMHMMGKRVNYAARSVITPDPNLNIDEIGVPEAFALKLTYPVPVTPWNVTELRKLIINGPEIHPGAVMIEGEDGFVKLLRGDDKTQLEAIAKRLLTSSRKPFSGIKIVHRHLQNGDMLLLNRQPTLHKPSIMAHKARILKGEKTLRLHYANCKAYNADFDGDEMNAHFPQNELARSEGYFIANVSNQYLVPKDGTPLGGLIQDHVISGVRLTLRGNFFNRQDYMQLVYSAIADTTGDLILLPPTILKPVRLWSGKQIISTVIINLTPRGRAPINLKASAKISVKDWQVKKARKWKCGQEFTDQRTMSEAEVVIRGGELLSGVLDKTHYGATPYGLIHCLFELYGGTCSSKVLSAFGKLFQTYLQISGFTLGVEDILVVRKSDQKRREIIEACRQIGDQIQTATVELPPGTSEEQVKSKMEESYAKDPKFRAIVDRKYKSALDVFTNNINKTCLPAGLLKKFPHNNLQLMVQSGAKGSTVNTMQISCLLGQIELEGKRPPLMINGKSLPSFPAYDSSPRSGGFIDGRFMTGIQPQEFFFHCMAGREGLIDTAVKTSRSGYLQRCLIKHLEGLTVNYDSTVRDSDGSLIQMSYGEDGLDIPNSRFLRKEELDFLVENRKAIVDPALVEHLKDETTEKIRKINKKIRKWRTKHGNGSTKWRNSEFAKFSEINRNSGSSKNRQINSNCGRTKAALSLMKKWIRADEEVKKKLKDECVRCPDPVTSIFRQDLQFGVLTEKMEALMEEYLDEKSRRFTTSIGKEEVRDLLCTKIMKSLCPPGEPVGLLAAQSIGEPSTQMTLNTFHFAGRGEMNVTLGIPRLREILMMASKNIKTPSMEIPFRTDLPNVENQATKLQLKLTKCYLSNILKNIKLDRKLEENPNRQLTFTLTVNCLPHKFYKNEYCVKPHNVLNEIERNFFKLFFRAIKKIGKATGTLLHIEEEKSSSREDDAMLDTGEPDETEAKPNRSDLGELHESSDEDEAAEDADATASRSIARHRENQEYEDPEEEEIEDAAPREPEDEENPQNPTNLPPEDEDDLDQPMCVADELITEQRKKDVVNMHPYALDYDYDSEKFLWCKLTFWLPLRMCRLDLPTILRTVAEKVVLWETPAIKRAFTFQNSEGETILKTDGLNIVEMFKYAQILDLHKLYTNDIYGVSRTYGIEAANRVILKEVKDVFKMYGITVDSRHLSLIADYMTFDGTFQPLSRKGMEDSASPLQQMSFEASLNFLKNATLQGKHDDLMSPSSRLMVGQPCKTGTGAFNVLFKMNNTAVSM
Someone could help me?
Thanks you

Python splitting with string as delimiter

I have a file that looks something like this:
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC
And I need to split it into the "non-N" sequences, so two separate files like this:
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCT
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGC
TATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACA
TGTGTTGC
What I currently have is this:
UMfile = open ("C:\Users\Manuel\Desktop\sequence.txt","r")
contignumber = 1
contigfile = open ("contig "+str(contignumber), "w")
DNA = UMfile.read()
DNAstring = str(DNA)
for s in DNAstring:
DNAstring.split("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN",1)
contigfile.write(DNAstring)
contigfile.close()
contignumber = contignumber+1
contigfile = open ("contig "+str(contignumber), "w")
The thing is that I realize there is a linebreak between the "Ns" and that is why it is not splitting my file, but the "file" I'm showing is just a part of a much much bigger one. So sometimes the "Ns" will look like this "NNNNNN\n" and sometimes like "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN\n", yet there is always a count of 1000 Ns between my sequences that I need to split.
So my question is: How do I tell Python to split and write into different files at every run of 1000 Ns, knowing that there will be a different number of Ns on each line?
Thank you all very much, I really have no informatics background and my python skills are at best basic.
Just split your string on 'N' and then remove all the strings that are empty, or just contain a newline. Like this:
#!/usr/bin/env python
DNAstring = '''AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACT
TCTCAATGGGCAGTACATATCATCTCTNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNNNNNNAAAACGTGTGCATGAACAAAAAA
CGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCAC
TAAATCAACGGACATGTGTTGC'''
sequences = [u for u in DNAstring.split('N') if u and u != '\n']
for i, seq in enumerate(sequences):
print i
print seq.replace('\n', '') + '\n'
output
0
AAACAACAGGGTACAAAGAGTCACGCTTATCCTGTTGATACTTCTCAATGGGCAGTACATATCATCTCT
1
AAAACGTGTGCATGAACAAAAAACGTAGCAGATCGTGACTGGCTATTGTATTGTGTCAATTTCGCTTCGTCACTAAATCAACGGACATGTGTTGC
The code snippet above also removes newlines inside the sequences using .replace('\n', '').
Here are a few programs that you may find useful.
Firstly, a line buffer class. You initialise it with a file name and a line width. You can then feed it random length strings and it will automatically save them to the text file, line by line, with all lines (except possibly the last line) having the given length. You can use this class in other programs to make your output look neat.
Save this file as linebuffer.py to somewhere in your Python path; the simplest way is to save it wherever you save your Python programs and make that the current directory when you run the programs.
linebuffer.py
#! /usr/bin/env python
''' Text output buffer
Write fixed width lines to a text file
Written by PM 2Ring 2015.03.23
'''
class LineBuffer(object):
    ''' Text output buffer

        Write fixed width lines to file fname.

        Strings of arbitrary length are fed in with write(); the buffer
        emits them to the file as lines of exactly `width` characters.
        Only the final line, flushed by close(), may be shorter.

        Can be used as a context manager, which closes the file on exit.
    '''
    def __init__(self, fname, width):
        ''' Open fname for writing (truncating it) with the given line width

            Raises ValueError if width is not positive; a width of zero
            would otherwise make the line-splitting loop run forever.
        '''
        # Validate before opening so a bad width does not leak a file handle
        if width < 1:
            raise ValueError('width must be a positive integer')
        self.fh = open(fname, 'wt')
        self.width = width
        self.buff = []
        self.bufflen = 0

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def write(self, data):
        ''' Write a string to the buffer '''
        self.buff.append(data)
        self.bufflen += len(data)
        if self.bufflen >= self.width:
            self._save()

    def _save(self):
        ''' Write all complete lines in the buffer to the file

            Any trailing partial line stays in the buffer.
        '''
        buff = ''.join(self.buff)
        width = self.width
        #Index just past the last complete line
        cut = len(buff) - len(buff) % width
        if cut:
            #Slice by offset rather than repeatedly re-slicing the
            #remainder, which copies the whole tail once per line
            lines = [buff[i:i + width] for i in range(0, cut, width)]
            self.fh.write('\n'.join(lines) + '\n')
        rest = buff[cut:]
        self.buff = [rest]
        self.bufflen = len(rest)

    def close(self):
        ''' Flush the buffer & close the file

            Calling close() again is a no-op.
        '''
        if self.fh.closed:
            return
        try:
            if self.bufflen > 0:
                self.fh.write(''.join(self.buff) + '\n')
        finally:
            self.buff = []
            self.bufflen = 0
            self.fh.close()
def testLB():
    ''' Smoke test for LineBuffer

        Writes the alphabet 30 times followed by ' bye.' to
        linebuffer_test.txt using 27-character lines.
    '''
    alpha = 'abcdefghijklmnopqrstuvwxyz'
    fname = 'linebuffer_test.txt'
    lb = LineBuffer(fname, 27)
    try:
        #range (not xrange) so this runs on Python 2 and Python 3
        for _ in range(30):
            lb.write(alpha)
        lb.write(' bye.')
    finally:
        #Flush the final partial line and close the file even on error
        lb.close()
if __name__ == '__main__':
testLB()
Here is a program that makes random DNA sequences of the form you described in your question. It uses linebuffer.py to handle the output. I wrote this so I could test my DNA sequence splitter properly.
Random_DNA0.py
#! /usr/bin/env python
''' Make random DNA sequences
Sequences consist of random subsequences of the letters 'ACGT'
as well as short sequences of 'N', of random length up to 200.
Exactly 1000 'N's separate sequence blocks.
All sequences may contain newlines chars
Takes approx 3 seconds per megabyte generated and saved
on a 2GHz CPU single core machine.
Written by PM 2Ring 2015.03.23
'''
import sys
import random
from linebuffer import LineBuffer
#Set seed to None to seed randomizer from system time
random.seed(37)
#Output line width
linewidth = 120
#Subsequence base length ranges
minsub, maxsub = 15, 300
#Subsequences per sequence ranges
minseq, maxseq = 5, 50
#random 'N' sequence ranges
minn, maxn = 5, 200
#Probability that a random 'N' sequence occurs after a subsequence
randn = 0.2
#Sequence separator
nsepblock = 'N' * 1000
def main():
    ''' Generate random DNA sequences and save them to DNA_sequence.txt

        The number of sequences is taken from the first command line
        argument and defaults to 2. Consecutive sequences are separated
        by nsepblock; output lines are linewidth characters wide.
    '''
    #Get number of sequences from the command line
    numsequences = int(sys.argv[1]) if len(sys.argv) > 1 else 2

    outname = 'DNA_sequence.txt'
    lb = LineBuffer(outname, linewidth)
    try:
        for i in range(numsequences):
            #Write the 1000*'N' separator between sequences
            if i > 0:
                lb.write(nsepblock)

            for j in range(random.randint(minseq, maxseq)):
                #Possibly make a short run of 'N's in the sequence
                if j > 0 and random.random() < randn:
                    lb.write('N' * random.randint(minn, maxn))

                #Create a single subsequence
                sublen = random.randint(minsub, maxsub)
                lb.write(''.join([random.choice('ACGT') for _ in range(sublen)]))
    finally:
        #Flush the last partial line and close the file even on error
        lb.close()
if __name__ == '__main__':
main()
Finally, we have a program that splits your random DNA sequences. Once again, it uses linebuffer.py to handle the output.
DNA_Splitter0.py
#! /usr/bin/env python
''' Split DNA sequences and save to separate files
Sequences consist of random subsequences of the letters 'ACGT'
as well as short sequences of 'N', of random length up to 200.
Exactly 1000 'N's separate sequence blocks.
All sequences may contain newlines chars
Written by PM 2Ring 2015.03.23
'''
import sys
from linebuffer import LineBuffer
#Output line width
linewidth = 120
#Sequence separator
nsepblock = 'N' * 1000
def main():
    ''' Split DNA_sequence.txt on nsepblock and save each part to a file

        Newlines are removed from the input before splitting, so a
        separator that spans several lines is still found. The parts
        are written to contig00001, contig00002, ... with lines of
        linewidth characters.
    '''
    iname = 'DNA_sequence.txt'
    outbase = 'contig'

    with open(iname, 'rt') as f:
        data = f.read()

    #Remove all newlines
    data = data.replace('\n', '')

    sequences = data.split(nsepblock)

    #Save each sequence to a series of files
    for i, seq in enumerate(sequences, 1):
        outname = '%s%05d' % (outbase, i)
        print(outname)

        #Write sequence data, with line breaks
        lb = LineBuffer(outname, linewidth)
        try:
            lb.write(seq)
        finally:
            lb.close()
if __name__ == '__main__':
main()
assuming you can read the whole file at once
s=DNAstring.replace("\n","") # first remove the nasty linebreaks
l=[x for x in s.split("N") if x] # split and drop empty lines
for x in l: # print in chunks
while x:
print x[:10]
x=x[10:]
print # extra linebreak between chunks
You could simply replace every N and \n with a space, and then split.
result = DNAstring.replace("\n", " ").replace("N", " ").split()
This will give you back a list of strings, and the 'ACGT' sequences will also be split with every new line.
If this is not your goal and you want to preserve the \n inside the 'ACGT' sequences rather than split on it, you can do the following:
result = DNAstring.replace("N\n", " ").replace("N", " ").split()
this will only remove the \n if it is in the middle of an N sequence.
To split your string exactly after 1000 Ns:
# 1/ Get rid of line breaks in the N sequence
result = DNAstring.replace("N\n", "N")
# 2/ split every 1000 Ns
result = result.split(1000*"N")

Biopython Large Sequence splitting

I'm a newbie in the field of python programming. As I was trying to do some analysis,(I've tried to find the answer on other posts, but nothing) I decided to post my first and probably very foolish question. Why does this create only one output file although in this example there were supposed to be at least 8 (sequence is more than 8000 characters).
Thank you for your answer upfront.
def batch_iterator(iterator, batch_size):
    """Yield lists of up to batch_size entries taken from iterator.

    Every batch holds exactly batch_size entries, except possibly the
    last one, which holds whatever remains. Nothing is yielded for an
    empty input.

    iterator may be any iterable (for example the result of
    SeqIO.parse). Entries are batched as-is: falsy entries and None do
    not end the iteration early.

    Raises ValueError if batch_size is smaller than 1.
    """
    from itertools import islice

    if batch_size < 1:
        # A non-positive size can never fill a batch, so iteration
        # would make no progress.
        raise ValueError("batch_size must be at least 1")

    # iter() accepts both iterators and plain iterables, and avoids the
    # Python 2-only iterator.next() method.
    source = iter(iterator)
    while True:
        batch = list(islice(source, batch_size))
        if not batch:
            # Input exhausted
            return
        yield batch
from Bio import SeqIO
record_iter = SeqIO.parse(open("some.fasta"),"fasta")
for i, batch in enumerate(batch_iterator(record_iter, 1000)) : #I think sth is wrong here?
filename = "group_%i.fasta" % (i+1)
handle = open(filename, "w")
count = SeqIO.write(batch, handle, "fasta")
handle.close()
print "Wrote %i records to %s" % (count, filename)
Sequence chunks
After a long discussion with the OP, here is my very restructured proposal, using the generator function defined in this other SO thread
# file: main.py
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def chunks(l, n):
    """Yield successive n-sized chunks from l.

    l must support len() and slicing (string, list, ...). The final
    chunk is shorter than n when len(l) is not a multiple of n, and
    nothing is yielded when l is empty.
    """
    # range rather than xrange, so this runs on Python 2 and Python 3
    for start in range(0, len(l), n):
        yield l[start:start + n]
if __name__ == '__main__':
    # Only the first record of long.fasta is split, so read just that one
    # instead of loading every record into a list.
    with open('long.fasta', 'r') as handle:
        record = next(SeqIO.parse(handle, "fasta"))

    # str(record.seq) gives the raw sequence text; it replaces the
    # deprecated record.seq.tostring().
    for pos, chunk in enumerate(chunks(str(record.seq), 1000)):
        # Each chunk becomes its own record, reusing the original metadata.
        # NOTE(review): record.seq.alphabet assumes a Biopython release that
        # still has alphabets -- confirm against the installed version.
        chunk_record = SeqRecord(
            Seq(chunk, record.seq.alphabet),
            id=record.id, name=record.name,
            description=record.description)
        outfile = "group_%d.fasta" % pos
        # Close each output file as soon as its record has been written.
        with open(outfile, 'w') as out_handle:
            SeqIO.write(chunk_record, out_handle, "fasta")
Note that your original code does something very different: it takes new records from the generator provided by the SeqIO.parse function and tries to store them in different files. If you want to split a single record into smaller sub-sequences, you have to access the record's internal data, which is done by converting record.seq to a string (record.seq.tostring() in older Biopython releases, str(record.seq) in current ones). The chunks generator function, as described in the other thread linked above, returns as many chunks as can be built from the sequence passed in. Each of them is stored as a new FASTA record in a different file (if you want to keep just the sequence, write the chunk directly to the opened outfile).
Check that it works
Consider the following code:
# file: generate.py
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
from Bio import SeqIO
long_string = "A" * 8000
outfile = open('long.fasta', 'w')
record = SeqRecord(Seq(
long_string,
IUPAC.protein),
id="YP_025292.1", name="HokC",
description="toxic membrane protein, small")
SeqIO.write(record, outfile, "fasta")
It writes a single record to a file named "long.fasta". This single record has a Sequence inside that is 8000 characters long, as generated in long_string.
How to use it:
$ python generate.py
$ wc -c long.fasta
8177 long.fasta
The overhead over 8000 characters is the file header.
How to split that file in chunks of 1000 length each, with the code snippet above:
$ python main.py
$ ls
generate.py group_1.fasta group_3.fasta group_5.fasta group_7.fasta main.py
group_0.fasta group_2.fasta group_4.fasta group_6.fasta long.fasta
$ wc -c group_*
1060 group_0.fasta
1060 group_1.fasta
1060 group_2.fasta
1060 group_3.fasta
1060 group_4.fasta
1060 group_5.fasta
1060 group_6.fasta
1060 group_7.fasta
8480 total

How to create a new list or new line after a certain number of iterations

I have a text file that I am parsing one column of data from and the result is one big list (50 elements):
CLB, HNRG, LPI, MTDR, MVO, NRGY, PSE, PVR, RRC, WES, ACMP, ATLS, ATW, BP, BWP, COG, DGAS, DNR, EPB, EPL, EXLP, NOV, OIS, PNRG, SEP, APL, ARP, CVX, DMLP, DRQ, DWSN, EC, ECA, FTI, GLOG, IMO, LINE, NFX, OILT, PNG, QRE, RGP, RRMS, SDRL, SNP, TLP, VNR, XOM, XTXI, AHGP
Now, after every 10 elements in that list, I want a new line. So the way I thought to approach it is to split the list onto a new line after every 10 commas. Here is my approach:
import csv
import re
filename = input("Please enter file name to extract data from: ")
with open(filename) as f:
next(f)
data = f.readlines()
my_list2 = []
ticker_list = []
for line in data:
my_list = line.split()
my_list2.append(my_list[1])
for item in my_list2:
ticker_list = ', '.join(my_list2)
count = 0
for item in ticker_list:
if item == ",":
count += 1
if count == 10:
ticker_list = [i.split('\n')[0] for i in ticker_list]
print (ticker_list)
##with open("ticker_data.txt", "w") as file:
## file.write(', '.join(ticker_list))
But it doesn't seem to work, does anyone have a solution for me that will give me this result in a txt file:
CLB, HNRG, LPI, MTDR, MVO, NRGY, PSE, PVR, RRC, WES,
ACMP, ATLS, ATW, BP, BWP, COG, DGAS, DNR, EPB, EPL,
EXLP, NOV, OIS, PNRG, SEP, APL, ARP, CVX, DMLP, DRQ,
DWSN, EC, ECA, FTI, GLOG, IMO, LINE, NFX, OILT, PNG,
QRE, RGP, RRMS, SDRL, SNP, TLP, VNR, XOM, XTXI, AHGP
Thanks, I'm using Python 3 by the way..
Ok Using a file called rawdata.txt that looks like this:
CLB, HNRG, LPI, MTDR, MVO, NRGY, PSE, PVR, RRC, WES, ACMP, ATLS, ATW, BP, BWP, COG, DGAS, DNR, EPB, EPL, EXLP, NOV, OIS, PNRG, SEP, APL, ARP, CVX, DMLP, DRQ, DWSN, EC, ECA, FTI, GLOG, IMO, LINE, NFX, OILT, PNG, QRE, RGP, RRMS, SDRL, SNP, TLP, VNR, XOM, XTXI, AHGP
Here is a script that reads each line and splits it into rows with no more than 10 symbols per row:
import csv
# Python 3 version (the asker uses Python 3): the csv module wants a
# text-mode file opened with newline='' rather than the Python 2 'wb' mode.
with open('rawdata.txt') as f, \
        open('ticker_data.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    for line in f:
        # Split on commas and strip surrounding whitespace, so the trailing
        # newline does not stay attached to the last symbol of the line.
        data = [symbol.strip() for symbol in line.split(',')]
        # Drop the empty fields left by blank lines or trailing commas.
        data = [symbol for symbol in data if symbol]
        # Emit the symbols ten per row; the last row may be shorter.
        for start in range(0, len(data), 10):
            writer.writerow(data[start:start + 10])
Which produces a file with this in it:
CLB,HNRG,LPI,MTDR,MVO,NRGY,PSE,PVR,RRC,WES
ACMP,ATLS,ATW,BP,BWP,COG,DGAS,DNR,EPB,EPL
EXLP,NOV,OIS,PNRG,SEP,APL,ARP,CVX,DMLP,DRQ
DWSN,EC,ECA,FTI,GLOG,IMO,LINE,NFX,OILT,PNG
QRE,RGP,RRMS,SDRL,SNP,TLP,VNR,XOM,XTXI,AHGP
You could do this:
import csv
from itertools import izip_longest
with open('/tmp/line.csv','r') as fin:
cr=csv.reader(fin)
n=10
data=izip_longest(*[iter(list(cr)[0])]*n,fillvalue='')
print '\n'.join(', '.join(t) for t in data)
With your data, prints:
CLB, HNRG, LPI, MTDR, MVO, NRGY, PSE, PVR, RRC, WES
ACMP, ATLS, ATW, BP, BWP, COG, DGAS, DNR, EPB, EPL
EXLP, NOV, OIS, PNRG, SEP, APL, ARP, CVX, DMLP, DRQ
DWSN, EC, ECA, FTI, GLOG, IMO, LINE, NFX, OILT, PNG
QRE, RGP, RRMS, SDRL, SNP, TLP, VNR, XOM, XTXI, AHGP
Edit
With the clarification (Py 3)
I would write your program this way:
import csv
from itertools import zip_longest
n=10
with open('/tmp/rawdata.txt','r') as fin, open('/tmp/out.csv','w') as fout:
reader=csv.reader(fin)
writer=csv.writer(fout)
source=(e for line in reader for e in line)
for t in zip_longest(*[source]*n):
writer.writerow(list(e for e in t if e))
Changes:
Output is to a file;
Source of elements is a generator;
No matter how many lines or comma separated elements per line, the source is treated item by item (subject to csv/element considerations);
No matter what n is, the output is n elements long until there is the last bit < n
Another option is to use slices and xrange:
import csv
writer = csv.writer(open("output.txt", "w"))
for x in xrange(0,len(ticker_list),10):
writer.writerow(ticker_list[x:x+10])
xrange gives us the numbers between 0 and the length of list with step size 10, then we print out a slice of length 10 starting at each of these indicies to csvfile. csv.writer will take care of adding the comma delimiters etc.

"list index out of range" in python

I have Python code that indexes a text file containing Arabic words. I tested the code on an English text and it works well, but it gives me an error when I test it on an Arabic one.
Note: the text file is saved in unicode encoding not in ANSI encoding.
This is my code:
from whoosh import fields, index
import os.path
import csv
import codecs
from whoosh.qparser import QueryParser
# This list associates a name with each position in a row
columns = ["juza","chapter","verse","voc"]
schema = fields.Schema(juza=fields.NUMERIC,
chapter=fields.NUMERIC,
verse=fields.NUMERIC,
voc=fields.TEXT)
# Create the Whoosh index
indexname = "indexdir"
if not os.path.exists(indexname):
os.mkdir(indexname)
ix = index.create_in(indexname, schema)
# Open a writer for the index
with ix.writer() as writer:
with open("h.txt", 'r') as txtfile:
lines=txtfile.readlines()
# Read each row in the file
for i in lines:
# Create a dictionary to hold the document values for this row
doc = {}
thisline=i.split()
u=0
# Read the values for the row enumerated like
# (0, "juza"), (1, "chapter"), etc.
for w in thisline:
# Get the field name from the "columns" list
fieldname = columns[u]
u+=1
#if isinstance(w, basestring):
# w = unicode(w)
doc[fieldname] = w
# Pass the dictionary to the add_document method
writer.add_document(**doc)
with ix.searcher() as searcher:
query = QueryParser("voc", ix.schema).parse(u"بسم")
results = searcher.search(query)
print(len(results))
print(results[1])
Then the error is :
Traceback (most recent call last):
File "C:\Python27\yarab.py", line 38, in <module>
fieldname = columns[u]
IndexError: list index out of range
this is a sample of the file:
1 1 1 كتاب
1 1 2 قرأ
1 1 3 لعب
1 1 4 كتاب
While I cannot see anything obviously wrong with that, I would make sure you're designing for error. Make sure you catch any situation where split() returns more than expected amount of elements and handle it promptly (e.g. print and terminate). It looks like you might be dealing with ill-formatted data.
You are missing the source-encoding declaration in your script. The first line should be:
# encoding: utf-8
Also to open a file with the unicode encoding use:
import codecs
with codecs.open("s.txt",encoding='utf-8') as txtfile:

Categories

Resources