I have a dataframe like this:
old_name new_name pident length
gene1_0035_0042 geneA 100 560
gene2_0035_0042 geneA 100 545
gene3_0042_0035 geneB 99 356
gene4_0042_0035 geneB 97 256
gene6_0035_0042 geneB 96 567
and here is the fasta file (example):
>gene1_0035_0042
ATTGAC
>gene2_0035_0042
ATGAGCC
>gene3_0042_0035
AGCCAG
>gene4_0042_0035
AGCCAT
>gene6_0035_0042
AGCCATG
I wrote a script to replace the old_name of each sequence in the fasta file with the new_name (qseqid = old_name and Busco_ID = new_name in the example):
import pandas as pd
from Bio import SeqIO

blast = pd.read_table("matches_Busco_0035_0042_best_hit.m8", header=None)
blast.columns = ["qseqid", "Busco_ID", "pident", "length", "mismatch", "gapopen",
                 "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
repl = blast[blast.pident > 95]
repl.to_csv("busco_blast_non-rename.txt", sep='\t')
qseqid = repl.iloc[:, 0]   # .ix is deprecated; use positional indexing
Busco_ID = repl.iloc[:, 1]

newfile = []
count = 0
running_inds = {}
for rec in SeqIO.parse("concatenate_0035_0042_dna2.fa", "fasta"):
    # get the corresponding value for the record ID from the dataframe;
    # repl["qseqid"] and repl["Busco_ID"] hold the old and new sequence names, respectively
    x = repl.loc[repl["qseqid"] == rec.id, "Busco_ID"]
    # change the record if the match is not empty
    if x.any():
        running = running_inds.get(x.iloc[0], 1)  # get the running index for this sequence
        running_inds[x.iloc[0]] = running + 1
        # append the old identifier suffix to the new id name
        rec.name = rec.description = rec.id = x.iloc[0] + rec.id[rec.id.index("_"):]
        count += 1
    # append the record to the list
    newfile.append(rec)

# write the list into a new fasta file
SeqIO.write(newfile, "concatenate_with_busco_names_0035_0042_dna.fa", "fasta")
# tell us how hard you had to work for us
print("I changed {} entries!".format(count))
As you can see, I only keep the sequences with a pident > 95, but all sequences matching the same Busco_ID end up with the same new_name. Instead, I would like to add a running number at the end of each new name. For the above example, the fasta file would become:
>geneA_0035_0042_1
ATTGAC
>geneA_0035_0042_2
ATGAGCC
>geneB_0042_0035_1
AGCCAG
>geneB_0042_0035_2
AGCCAT
>geneB_0035_0042_1
AGCCATG
and so on
instead of:
>geneA_0035_0042
ATTGAC
>geneA_0035_0042
ATGAGCC
>geneB_0042_0035
AGCCAG
>geneB_0042_0035
AGCCAT
>geneB_0035_0042
AGCCATG
as my script currently does.
Thanks for your help!
Issue:
I got:
>EOG090X0FA0_0042_0042_1
>EOG090X0FA0_0042_0035_2
>EOG090X0FA0_0035_0035_3
>EOG090X0FA0_0035_0042_4
but since the suffixes are all different, each should get its own counter:
>EOG090X0FA0_0042_0042_1
>EOG090X0FA0_0042_0035_1
>EOG090X0FA0_0035_0035_1
>EOG090X0FA0_0035_0042_1
Add a dictionary before the start of the loop:
running_inds = {}
for rec in SeqIO.parse("concatenate_0035_0042_dna2.fa", "fasta"):
Now, before you perform
rec.name = rec.description = rec.id = x.iloc[0] + rec.id[rec.id.index("_"):]
first do the following:
running = running_inds.get(x.iloc[0] + rec.id[rec.id.index("_"):], 1)  # get the running index for this sequence
running_inds[x.iloc[0] + rec.id[rec.id.index("_"):]] = running + 1
and then simply append the running index to the name:
rec.name = rec.description = rec.id = x.iloc[0] + rec.id[rec.id.index("_"):] + '_' + str(running)
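Put together, the relevant part of the loop would look roughly like this (a sketch reusing the names from the question; the dictionary is keyed on the new name plus the old suffix, so each suffix gets its own counter):
running_inds = {}
for rec in SeqIO.parse("concatenate_0035_0042_dna2.fa", "fasta"):
    x = repl.loc[repl["qseqid"] == rec.id, "Busco_ID"]
    if x.any():
        # build the new base name: Busco_ID plus the old identifier suffix
        new_id = x.iloc[0] + rec.id[rec.id.index("_"):]
        running = running_inds.get(new_id, 1)   # per-name running index, starting at 1
        running_inds[new_id] = running + 1
        rec.name = rec.description = rec.id = new_id + '_' + str(running)
        count += 1
    newfile.append(rec)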
Here is my code:
inputFile = open("Employees.txt", "r").read()
inputList = inputFile.split("\n")
fList = []

def listString(s):
    string = ""
    return string.join(s)

for i in inputList:
    for x in i.split(","):
        fList.append(x)

# ---- problem section starts here ----
for y in range(len(fList)):
    if fList[y] == "90000":
        fList[y] = str(90000 * 1.05) + "\n"
    elif fList[y] == "75000":
        fList[y] = str(75000 * 1.05) + "\n"
    elif fList[y] == "110000":
        fList[y] = str(110000 * 1.05) + "\n"
    else:
        fList[y] = fList[y] + ","
# ---- problem section ends here ----

print(listString(fList))
file = open("Emp_Bonus.txt", "a")
file.write(listString(fList))
Employees.txt contains the following:
Adam Lee,Programmer,90000
Morris Heather,DA,75000
John Lee,PM,110000
I am trying to get the following output:
Adam Lee,Programmer,94500
Morris Heather,DA,78750
John Lee,PM,115500
The part of the code marked as the problem section is the problem: the input salaries need to be able to take different values instead of the code only working for the sample input. Every input salary has to be multiplied by 1.05. How should I go about doing this? Thanks!
Another way, without any library: read the lines of the file as a list using readlines(), then iterate over each line. Only modify the last part after splitting it with split(','), i.e. the salary of each line, and finally create the new file as per the requirements.
multiply, final_result = 1.05, []
with open('Employees.txt', 'r') as f:
    fList = f.readlines()
if fList:
    for line in fList:
        employee_info = line.split(',')
        name = employee_info[0]
        designation = employee_info[1]
        # the salary is the last field; strip the newline before converting
        salary = float(employee_info[2].replace('\n', '').strip()) * multiply
        final_result.append(f"{name},{designation},{salary}")
if final_result:
    with open('Emp_Bonus.txt', 'w') as f:
        f.write('\n'.join(final_result))
Output:
Adam Lee,Programmer,94500.0
Morris Heather,DA,78750.0
John Lee,PM,115500.0
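The salaries are written as floats (94500.0 rather than 94500). If you want them to match the expected output exactly, you could cast to int before formatting, assuming the raised salaries are whole numbers:
final_result.append(f"{name},{designation},{int(salary)}")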
I would like to use Pandas:
import pandas as pd

df = pd.read_csv("Employees.txt", header=None)
df[2] = (df[2] * 1.05).astype(int)  # multiply every salary, not just the sample values
df.to_csv("Emp_Bonus.txt", mode="a", header=False, index=False)
I have 150 csv files with two columns (time and site). I want to read each file, create a frequency dictionary ({'site': [site_number, number of occurrences of site]}), and create a DataFrame with 11 columns (user_id, site1, site2, ... site10), where user_id is parsed from the file name (../user0001.csv). Each row in the DataFrame is a unique session of 10 site visits. My code takes 150 seconds for 150 files (which is terrible). How can I improve it?
from glob import glob
import pandas as pd

def prepare_3(path_to_csv_files, session_length=10):
    word_freq = {}
    word_count = 0
    row = []
    columns = ['user_id']
    columns.extend(['site' + str(i) for i in range(1, session_length + 1)])
    lst_files = sorted(glob(path_to_csv_files))
    for csv in lst_files:
        user = int(csv[csv.find('.') - 4:csv.find('.')])
        frame = [user]
        site_count = 0
        with open(csv, 'r') as f:
            f.readline()  # skip the header line
            for line in f:
                site = line[line.find(',') + 1:].rstrip()
                site_count += 1
                if site in word_freq:
                    word_freq[site][1] += 1
                else:
                    word_count += 1
                    word_freq[site] = [word_count, 1]
                if site_count > session_length:
                    # session is full: store it and start a new one for the same user
                    site_count = 1
                    row.append(frame)
                    frame = [user]
                    frame.append(word_freq[site][0])
                else:
                    frame.append(word_freq[site][0])
        row.append(frame)  # store the last (possibly partial) session of the file
    df = pd.DataFrame(data=row, columns=columns, dtype=int)
    df.fillna(0, inplace=True)
    return df, word_freq
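A possible speed-up, sketched under two assumptions (each file has a header with a column named site, and the user id sits right before the file extension as in the question): let pandas parse each csv in C instead of looping over lines in Python, and cut the sessions with numpy reshaping.
from glob import glob
import numpy as np
import pandas as pd

def prepare_fast(path_to_csv_files, session_length=10):
    word_freq = {}
    rows = []
    for path in sorted(glob(path_to_csv_files)):
        user = int(path[path.rfind('.') - 4:path.rfind('.')])  # e.g. user0001.csv -> 1
        sites = pd.read_csv(path, usecols=['site'])['site']    # assumes a 'site' header
        codes = []
        for site in sites:
            if site not in word_freq:
                word_freq[site] = [len(word_freq) + 1, 0]
            word_freq[site][1] += 1
            codes.append(word_freq[site][0])
        # pad with zeros to a multiple of session_length, then cut into sessions
        pad = -len(codes) % session_length
        sessions = np.array(codes + [0] * pad).reshape(-1, session_length)
        for session in sessions:
            rows.append([user] + session.tolist())
    columns = ['user_id'] + ['site' + str(i) for i in range(1, session_length + 1)]
    return pd.DataFrame(rows, columns=columns), word_freq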
I need to slice a very long string (DNA sequences) in Python. Currently I'm using this:
new_seq = clean_seq[start:end]
I'm slicing roughly every 20,000 characters and taking slices about 1,000 characters long. The input is a 250 MB file containing a few strings, each identified with an id, and this method is taking too long.
The sequence string comes from the biopython module:
import pandas as pd
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

def fasta_from_ann(annotation, sequence, feature, windows, output_fasta):
    df_gff = pd.read_csv(annotation, index_col=False, sep='\t', header=None)
    df_gff.columns = ['seqname', 'source', 'feature', 'start', 'end',
                      'score', 'strand', 'frame', 'attribute']
    fasta_seq = SeqIO.parse(sequence, 'fasta')
    buffer = []
    for record in fasta_seq:
        df_extract = df_gff[(df_gff.seqname == record.id) & (df_gff.feature == feature)]
        for k, v in df_extract.iterrows():
            clean_seq = ''.join(str(record.seq).splitlines())
            if int(v.start) - windows < 0:
                start = 0
            else:
                start = int(v.start) - windows
            if int(v.end) + windows > len(clean_seq):
                end = len(clean_seq)
            else:
                end = int(v.end) + windows
            new_seq = clean_seq[start:end]
            new_id = record.id + "_from_" + str(v.start) + "_to_" + str(v.end) + "_feature_" + v.feature
            desc = "attribute: " + v.attribute + " strand: " + v.strand
            seq = SeqRecord(Seq(new_seq), id=new_id, description=desc)
            buffer.append(seq)
        print(record.id)
    SeqIO.write(buffer, output_fasta, "fasta")
Maybe there's a more memory-friendly way to accomplish this.
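One thing worth checking first: in the loop above, clean_seq is rebuilt from record.seq once per annotation row rather than once per record, and str(record.seq) should already contain no newlines, so the join is likely redundant. A minimal sketch of the inner part with the conversion hoisted out (same names as in the function above):
for record in fasta_seq:
    clean_seq = str(record.seq)  # convert once per record, not once per annotation row
    df_extract = df_gff[(df_gff.seqname == record.id) & (df_gff.feature == feature)]
    for k, v in df_extract.iterrows():
        start = max(int(v.start) - windows, 0)
        end = min(int(v.end) + windows, len(clean_seq))
        new_id = record.id + "_from_" + str(v.start) + "_to_" + str(v.end) + "_feature_" + v.feature
        desc = "attribute: " + v.attribute + " strand: " + v.strand
        buffer.append(SeqRecord(Seq(clean_seq[start:end]), id=new_id, description=desc))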
Hello, I'm currently working on my project. I want to get candidate text blocks by using the algorithm below.
My input is a csv document which contains:
HTML column : the html code in a line
TAG column : the tag of html code in a line
Words : the text inside the tag in a line
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tag in a line
P : the number of tag p and br in a line
CTTD : TC + (0.2*LTC) + TG - P
CTTDs : the smoothed CTTD
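For reference, CTTD as defined above maps directly onto the dataframe columns; a one-line sketch, assuming the columns are named as in the list:
df['CTTD'] = df['TC'] + 0.2 * df['LTC'] + df['TG'] - df['P']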
This is my algorithm to find candidate text blocks. I load each csv file into a dataframe using pandas and use the CTTDs, TC and TG columns to find the candidates.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np

def create_candidates(df, thetaCTTD, thetaGAP):
    k = 0
    TB = {}
    TC = 0
    for index in range(0, len(df) - 1):
        start = index
        if df.iloc[index]['CTTDs'] > thetaCTTD:
            start = index
            gap = 0
            TC = df.iloc[index]['TC']
            for index in range(index + 1, len(df) - 1):
                if df.iloc[index]['TG'] == 0:
                    continue
                elif df.iloc[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                    break
                elif df.iloc[index]['CTTDs'] <= thetaCTTD:
                    gap += 1
                TC += df.iloc[index]['TC']
            if (TC < 1) or (start == index):
                continue
            TB.update({
                k: {
                    'start': start,
                    'end': index - 1
                }
            })
            k += 1
    return TB

def get_unique_candidate(tb):
    TB = tb.copy()
    for key, value in tb.items():
        if key == len(tb) - 1:
            break
        if value['end'] == tb[key + 1]['end']:
            del TB[key + 1]
        elif value['start'] < tb[key + 1]['start'] < value['end']:
            TB[key]['end'] = tb[key + 1]['start'] - 1
        else:
            continue
    return TB

filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
    df = pd.read_csv(str(f))
    news = np.array(df['CTTDs'])
    new = np.array(df['TG'])
    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval - minval)
    thetaGap = np.min(new[np.nonzero(new)])

    index += 1
    stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
    tb = create_candidates(df, thetaCTTD, thetaGap)
    TB = get_unique_candidate(tb)

    df_list = []
    for (k, d) in TB.items():
        candidate_df = df.loc[d['start']:d['end']].copy()  # .copy() avoids SettingWithCopyWarning
        candidate_df['candidate'] = k
        df_list.append(candidate_df)
    output_df = pd.concat(df_list)
    output_df.to_csv(stored_file)
thetaCTTD is 10.36 and thetaGap is 1.
The output (shown as a screenshot in the original post) means there are 2 candidate text blocks: the first starts at line number 215 and ends at line number 225, and the other starts at line number 500 and ends at line number 501.
My question is: how do I save the output into a csv so that it contains not only the line numbers but the whole range of each text block, with the other columns included as well? My expected output is the full rows of each candidate block, as in the screenshot from the original post.
Assuming your output TB is a dictionary of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.items()])
Note that we slice by label, so d['end'] will be included.
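A quick way to convince yourself of the inclusive endpoint, with a default integer index:
import pandas as pd

df = pd.DataFrame({'x': range(6)})
print(len(df.loc[2:5]))   # 4 rows: label-based slicing includes the endpoint
print(len(df.iloc[2:5]))  # 3 rows: positional slicing excludes it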
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.items():
    candidate_df = df.loc[d['start']:d['end']].copy()
    candidate_df['candidate'] = k
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.
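To get this into a csv file, as asked, the same to_csv call from the question applies; index=False is an optional tweak that drops the original line numbers from the output:
output_df.to_csv(stored_file, index=False)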
The code below takes input from a sample file which contains first and last names, then converts those names to sample emails. For some reason the script keeps printing the same last name over and over.
namess.txt looks like this:
firstname,lastname
CODE:
import os, re, time, getpass, linecache

Original = os.path.join(os.path.expanduser('~'), 'Desktop', 'namess.txt')
File = os.path.join(os.path.expanduser('~'), 'Desktop', 'output.txt')
badNames = []
Names = []

def RemCommas():
    outfile = open(os.path.join('C:\\', 'Users', getpass.getuser(), 'Desktop', 'output.txt'), 'w')
    Filedata = open(Original).read()
    outfile.write(re.sub(',', ' ', Filedata))
    outfile.close()

def ClassNum():
    count = 6
    Year = int(time.strftime('%Y'))
    Class = str((Year - 2013) + 6)
    return Class

def ReadStoreFile():
    i = 0
    OpenFile = open(File)
    LenFile = len(OpenFile.readlines())
    while i < LenFile:
        i += 1
        badNames.append(linecache.getline(File, i))

def CleanNames():
    i = 0
    while i < len(badNames):
        cleaned = badNames[i].rstrip()
        Names.append(cleaned)
        i += 1

def NamePrint():
    Interns = 'makchessclub.org'
    arrayname = []
    i = 0
    j = 0
    m = 0
    while m < len(Names):
        Name = Names[m]
        Name = Name.lower()
        InternName = Name[0] + Name[1]
        # ------------ Checking for space and first name --
        while i < len(Name):
            if Name[i] == ' ':
                i = Name.index(' ')
                break
            i += 1
        # --------------- adding last name in an array ----
        Namelen = len(Name) - (i + 1)
        while j < Namelen:
            arrayname.append(Name[i + 1])
            j += 1
            i += 1
        # --------------- Final Name Print ----------------
        Lastname = ''.join(arrayname)
        # print arrayname
        # Lastname = Lastname.strip(' ')
        # print InternName + Lastname + ClassNum() + Interns
        file = open('C:\\Users\\username\\Desktop\\emails.txt', 'a')
        file.write(InternName + Lastname + ClassNum() + Interns + '\n')
        file.close()
        m += 1

RemCommas()
ReadStoreFile()
CleanNames()
NamePrint()
print ''
os.system('pause')
The reason the last name doesn't change is that you are not resetting arrayname in your loop: you keep appending letters to it, and the program keeps picking up the first name's letters. So you should put arrayname = [] right after while m < len(Names):
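A minimal sketch of the fix; note that the index variables i and j most likely need the same per-name reset, for the same reason:
while m < len(Names):
    arrayname = []  # reset the last-name buffer for every name
    i = 0           # reset the space-search index too
    j = 0
    Name = Names[m].lower()
    InternName = Name[0] + Name[1]
    # ... rest of the loop body unchanged ...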
I guess this is what you are trying to do:
import os
import re
import time

def create_mails(input_path, output_path, year, addr):
    with open(input_path, 'r') as data:
        mail = re.sub(r'(\w+)\s*,\s*(\w+)\n?', r'\1\g<2>%s%s\n' % (year, addr), data.read())
    with open(output_path, 'w') as output:
        output.write(mail.lower())
    print 'Mail addresses generated and saved to', output_path
Demo:
create_mails(
    os.path.join(os.path.expanduser('~'), 'Desktop', 'namess.txt'),
    os.path.join(os.path.expanduser('~'), 'Desktop', 'output.txt'),
    str(int(time.strftime('%Y')) - 2013 + 6),
    '#makchessclub.org'
)
If namess.txt is something like this:
First, Last
John,Doe
Spam, Ham
Cabbage, egg
Then output.txt is going to be like this:
firstlast6#makchessclub.org
johndoe6#makchessclub.org
spamham6#makchessclub.org
cabbageegg6#makchessclub.org
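One detail worth noting in the answer above: the replacement uses \g<2> rather than \2 because the year digits come right after the group reference, and \26 would be read as a reference to group 26. A quick demo of the substitution on a single line, using the same sample year and address values:
import re

line = 'John,Doe\n'
print(re.sub(r'(\w+)\s*,\s*(\w+)\n?', r'\1\g<2>%s%s\n' % ('6', '#makchessclub.org'), line).lower())
# johndoe6#makchessclub.org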