I have 150 CSV files with two columns (time and site). I want to read each file, build a frequency dictionary ({'site': [site_number, number of occurrences of the site]}), and build a DataFrame with 11 columns (user_id, site1, site2, ... site10), where user_id is parsed from the file name (../user0001.csv). Each row of the DataFrame is one unique session of 10 site visits. My code takes 150 seconds on the 150 files (which is terrible). How can I improve it?
import pandas as pd
from glob import glob

def prepare_3(path_to_csv_files, session_length=10):
    word_freq = {}
    freq_dict = {}
    word_count = 0
    row = []
    columns = []
    columns.append('user_id')
    columns.extend(['site' + str(i) for i in range(1, session_length + 1)])
    lst_files = sorted(glob(path_to_csv_files))
    for csv in lst_files:
        user = int(csv[csv.find('.') - 4:csv.find('.')])
        frame = []
        frame.append(user)
        site_count = 0
        with open(csv, 'r') as f:
            f.readline()
            for line in f:
                site = line[line.find(',') + 1:].rstrip()
                site_count += 1
                if site in word_freq:
                    word_freq[site][1] += 1
                else:
                    word_count += 1
                    word_freq[site] = [word_count, 1]
                if site_count > session_length:
                    site_count = 1
                    row.append(frame)
                    frame = []
                    frame.append(user)
                    frame.append(word_freq[site][0])
                else:
                    frame.append(word_freq[site][0])
        row.append(frame)
    df = pd.DataFrame(data=row, columns=columns, dtype=int)
    df.fillna(0, inplace=True)
    return df, word_freq
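One direction to explore (a sketch of my own, not the poster's code): let pandas parse each file in a single read_csv call and build the frequency table with pd.unique/value_counts instead of handling every line in Python. Whether it actually helps depends on where the time is really spent, and it assumes the files have header columns named exactly time and site; the function name prepare_fast is made up for illustration.

import pandas as pd
from glob import glob

def prepare_fast(path_to_csv_files, session_length=10):
    # Sketch only: parse each file with pandas instead of looping over raw lines.
    frames = []
    for path in sorted(glob(path_to_csv_files)):
        user = int(path[path.rfind('.') - 4:path.rfind('.')])    # ../user0001.csv -> 1
        one_user = pd.read_csv(path)
        one_user['user_id'] = user
        frames.append(one_user)
    data = pd.concat(frames, ignore_index=True)

    # site -> [site_number, occurrences], numbered in order of first appearance
    uniques = pd.unique(data['site'])
    counts = data['site'].value_counts()
    word_freq = {s: [i + 1, int(counts[s])] for i, s in enumerate(uniques)}

    # split each user's visit stream into sessions of `session_length` sites
    rows = []
    for user, grp in data.groupby('user_id', sort=True):
        ids = [word_freq[s][0] for s in grp['site']]
        for start in range(0, len(ids), session_length):
            chunk = ids[start:start + session_length]
            chunk += [0] * (session_length - len(chunk))          # pad the last session with 0
            rows.append([user] + chunk)
    columns = ['user_id'] + ['site' + str(i) for i in range(1, session_length + 1)]
    return pd.DataFrame(rows, columns=columns), word_freq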
I have a text file that contains data on lines like this:
Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb
Total usage 2022-09-17T06:04:18+02:00 for vob "/XpressCM/SX-BASE" is 25265.7 Mb
Total usage 2022-09-17T06:02:56+02:00 for vob "/vobs/LI" is 5916.9 Mb
I want to process this data and export it to an Excel file like this:
TAG Size
/Tmobile_Cro/TmobileCro 7868.0 Mb
/XpressCM/SX-BASE 25265.7 Mb
This is my code:
import xlrd, xlwt, re
from svnscripts.timestampdirectory import createdir, path_dir
import os

def clearcasesize():
    pathdest = path_dir()
    dest = createdir()
    txtName = rf"{pathdest}\vobs_size.txt"
    workBook = xlwt.Workbook(encoding='ascii')
    workSheet = workBook.add_sheet('sheet1')
    fp = open(txtName, 'r+b')
    workSheet.write(0, 0, "TAG")
    workSheet.write(0, 1, "Size")
    row = 0
    entries = 0
    fullentry = []
    for linea in fp.readlines():
        str_linea = linea.decode('gb2312', 'ignore')
        str_linea = str_linea[:-2]  # str string
        txt = str_linea
        arr = str_linea
        if arr[:9] == "Total":
            txt = arr
            entries += 1
            s = txt.index("/")
            e = txt.index('"', s)
            txt = txt[s:e]
            fullentry.append(txt)
        elif arr.find("is") >= 0:
            entries += 1
            txt = arr
            s = txt.index("is")
            txt1 = txt[s + 7:20]
            fullentry.append(txt1)
            if (row == 65536):
                break;
    finalarr = []
    finalarr1 = []
    temp = 0
    row = 1
    for r in fullentry:
        finalarr.append(r)
        temp += 1
        if temp == 12:
            finalarr1.append(finalarr)
            temp = 0
            col = 0
            for arr in finalarr:
                workSheet.write(row, col, arr)
                col += 1
            row += 1
            finalarr.clear()
            if (row == 65536):
                break;
    workBook.save(os.path.join(dest, "ClearcaseSize.xls"))
    fp.close()

clearcasesize()
The code should work, but it only creates the column names "TAG" and "Size".
The idea of the script is to locate the lines that start with "Total", put the text found between the double quotes into the first column, and then use the "is" part of the line to put the size into the second column, but it's not working...
Based on your short sample:
Total usage 2022-09-17T06:02:50+02:00 for vob "/Tmobile_Cro/TMobileCro" is 7868.0 Mb
Total usage 2022-09-17T06:04:18+02:00 for vob "/XpressCM/SX-BASE" is 25265.7 Mb
Total usage 2022-09-17T06:02:56+02:00 for vob "/vobs/LI" is 5916.9 Mb
Why don't you try pandas like this:
import pandas as pd

with open("sample_file.txt") as f:
    lines = [
        line.strip().split(" vob ")[-1].replace('"', "").split(" is ")
        for line in f.readlines()
    ]

df = pd.DataFrame(
    [l for l in lines if not l[0].startswith("/vobs")],
    columns=["TAG", "Size"],
)
df.to_excel("sample_file.xlsx", index=False)
I don't have Excel, but you should get an xlsx file with the TAG and Size columns filled in as in your expected output.
I am trying to call an API in a while loop and append to the DataFrame, but it is not appending.
import requests
import pandas as pd

#Max timestamp
MaxTs = 1635876000
api_key = "api_key"
cnt = 0

while cnt < 4:
    url = f"https://min-api.cryptocompare.com/data/v2/histohour?fsym=BTC&tsym=USD&limit=2000&toTs={MaxTs}&api_key={api_key}"
    r = requests.get(url)
    data = r.json()
    price_df = pd.DataFrame(data['Data']['Data'])
    i = 0
    reccnt = 2000
    while i < reccnt:
        currTs = price_df.iloc[i]['time']
        if currTs < MaxTs:
            MaxTs = currTs
        i = i + 1
    if cnt == 0:
        #Copying the original df to the new df.
        newdf = price_df.copy()
    else:
        #when counter increases, append the df.
        newdf.append(price_df)

print(MaxTs)
cnt = cnt + 1
You should increase cnt inside the while loop, not outside.
But after you perform a correction you will get several copies of the same price_df. Is that what you are trying to get?
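As a side note (my addition, not part of the original answer): DataFrame.append returns a new DataFrame rather than modifying newdf in place, so its result has to be assigned back, or the batches can be collected in a list and concatenated once at the end. A minimal sketch of that pattern, keeping the poster's variable names:

import requests
import pandas as pd

MaxTs = 1635876000
api_key = "api_key"
frames = []

for cnt in range(4):
    url = (
        "https://min-api.cryptocompare.com/data/v2/histohour"
        f"?fsym=BTC&tsym=USD&limit=2000&toTs={MaxTs}&api_key={api_key}"
    )
    data = requests.get(url).json()
    price_df = pd.DataFrame(data['Data']['Data'])
    frames.append(price_df)                    # collect each batch
    MaxTs = int(price_df['time'].min())        # step back to the oldest timestamp fetched
    print(MaxTs)

newdf = pd.concat(frames, ignore_index=True)   # build the final DataFrame once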
I actually have a dataframe like this:
old_name new_name pident length
gene1_0035_0042 geneA 100 560
gene2_0035_0042 geneA 100 545
gene3_0042_0035 geneB 99 356
gene4_0042_0035 geneB 97 256
gene6_0035_0042 geneB 96 567
and here is the fasta file (example):
>gene1_0035_0042
ATTGAC
>gene2_0035_0042
ATGAGCC
>gene3_0042_0035
AGCCAG
>gene4_0042_0035
AGCCAT
>gene6_0035_0042
AGCCATG
In fact I wrote a script to replace, in a fasta file, the old_name of each sequence with the new_name (qseqid = old_name and Busco_ID = new_name in the example above):
import pandas as pd
from Bio import SeqIO

blast = pd.read_table("matches_Busco_0035_0042_best_hit.m8", header=None)
blast.columns = ["qseqid", "Busco_ID", "pident", "length", "mismatch", "gapopen",
                 "qstart", "qend", "sstart", "send", "evalue", "bitscore"]
repl = blast[blast.pident > 95]
repl.to_csv("busco_blast_non-rename.txt", sep='\t')
qseqid = repl.ix[:, 0]
Busco_ID = repl.ix[:, 1]

newfile = []
count = 0
running_inds = {}
for rec in SeqIO.parse("concatenate_0035_0042_dna2.fa", "fasta"):
    #get corresponding value for record ID from dataframe
    #repl["seq"] and "newseq" are the pandas columns with the old and new sequence names, respectively
    x = repl.loc[repl["qseqid"] == rec.id, "Busco_ID"]
    #change record, if not empty
    if x.any():
        #append old identifier number to the new id name
        running = running_inds.get(x.iloc[0], 1)  # Get the running index for this sequence
        running_inds[x.iloc[0]] = running + 1
        rec.name = rec.description = rec.id = x.iloc[0] + rec.id[rec.id.index("_"):]
        count += 1
    #append record to list
    newfile.append(rec)

#write list into new fasta file
SeqIO.write(newfile, "concatenate_with_busco_names_0035_0042_dna.fa", "fasta")
#tell us, how hard you had to work for us
print("I changed {} entries!".format(count))
As you can see, I only filter my sequences by keeping those with a pident > 95, but with this script all of those sequences end up with the same name (the new_name). Instead, I would like to add a number at the end of the new name. For the above example the fasta file would become:
>geneA_0035_0042_1
ATTGAC
>geneA_0035_0042_2
ATGAGCC
>geneB_0042_0035_1
AGCCAG
>geneB_0042_0035_2
AGCCAT
>geneB_0035_0042_1
AGCCATG
and so on
instead of:
>geneA_0035_0042
ATTGAC
>geneA_0035_0042
ATGAGCC
>geneB_0042_0035
AGCCAG
>geneB_0042_0035
AGCCAT
>geneB_0035_0042
AGCCATG
as my script does
Thanks for your help
Issue:
I got:
>EOG090X0FA0_0042_0042_1
>EOG090X0FA0_0042_0035_2
>EOG090X0FA0_0035_0035_3
>EOG090X0FA0_0035_0042_4
but since they are all different I should get:
>EOG090X0FA0_0042_0042_1
>EOG090X0FA0_0042_0035_1
>EOG090X0FA0_0035_0035_1
>EOG090X0FA0_0035_0042_1
Add a dictionary before the start of the loop:
running_inds = {}
for rec in SeqIO.parse("concatenate_0035_0042_dna2.fa", "fasta"):
Now when you perform
rec.name = rec.description = rec.id = x.iloc[0] + rec.id[rec.id.index("_"):]
first do the following:
running = running_inds.get(x.iloc[0] + rec.id[rec.id.index("_"):], 1) # Get the running index for this sequence
running_inds[x.iloc[0] + rec.id[rec.id.index("_"):]] = running + 1
now simply append this to the name:
rec.name = rec.description = rec.id = x.iloc[0] + rec.id[rec.id.index("_"):] + '_' + str(running)
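Putting those pieces together, the relevant part of the loop would look roughly like this (a sketch that assumes the repl, newfile and count variables from the question's script; the dictionary is keyed by the full new name including the suffix, so each distinct geneX_00xx_00yy combination gets its own counter):

running_inds = {}
for rec in SeqIO.parse("concatenate_0035_0042_dna2.fa", "fasta"):
    x = repl.loc[repl["qseqid"] == rec.id, "Busco_ID"]
    if x.any():
        new_name = x.iloc[0] + rec.id[rec.id.index("_"):]   # e.g. geneA_0035_0042
        running = running_inds.get(new_name, 1)              # running index for this exact name
        running_inds[new_name] = running + 1
        rec.name = rec.description = rec.id = new_name + '_' + str(running)
        count += 1
    newfile.append(rec)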
Hello, I am currently working on my project. I want to get candidate text blocks using the algorithm below.
My input is a CSV document which contains:
HTML column : the HTML code in a line
TAG column : the tag of the HTML code in a line
Words : the text inside the tag in a line
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tags in a line
P : the number of p and br tags in a line
CTTD : TC + (0.2*LTC) + TG - P (a small worked example follows this list)
CTTDs : the smoothed CTTD
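For instance (illustrative numbers of my own, not taken from the real data): a line with TC = 30 words, LTC = 5 anchor words, TG = 4 tags and P = 2 p/br tags gets CTTD = 30 + 0.2*5 + 4 - 2 = 33.

# Illustrative values only (not from the real data set):
TC, LTC, TG, P = 30, 5, 4, 2
CTTD = TC + (0.2 * LTC) + TG - P   # 30 + 1.0 + 4 - 2 = 33.0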
This is my algorithm to find candidate text blocks. I load each CSV file into a dataframe using pandas and use the CTTDs, TC and TG columns to find the candidates.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv

filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
    file_html = open(str(f), "r")
    df = pd.read_csv(file_html)
    #df = pd.read_csv('smoothing/Smoothing001.csv')
    news = np.array(df['CTTDs'])
    new = np.array(df['TG'])
    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval - minval)
    #maxGap = np.max(new[np.nonzero(new)])
    #minGap = np.min(new[np.nonzero(new)])
    thetaGap = np.min(new[np.nonzero(new)])
    #print thetaCTTD
    #print maxval
    #print minval
    #print thetaGap

    def create_candidates(df, thetaCTTD, thetaGAP):
        k = 0
        TB = {}
        TC = 0
        for index in range(0, len(df) - 1):
            start = index
            if df.ix[index]['CTTDs'] > thetaCTTD:
                start = index
                gap = 0
                TC = df.ix[index]['TC']
                for index in range(index + 1, len(df) - 1):
                    if df.ix[index]['TG'] == 0:
                        continue
                    elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                        break
                    elif df.ix[index]['CTTDs'] <= thetaCTTD:
                        gap += 1
                    TC += df.ix[index]['TC']
                if (TC < 1) or (start == index):
                    continue
                TB.update({
                    k: {
                        'start': start,
                        'end': index - 1
                    }
                })
                k += 1
        return TB

    def get_unique_candidate(TB):
        TB = tb.copy()
        for key, value in tb.iteritems():
            if key == len(tb) - 1:
                break
            if value['end'] == tb[key + 1]['end']:
                del TB[key + 1]
            elif value['start'] < tb[key + 1]['start'] < value['end']:
                TB[key]['end'] = tb[key + 1]['start'] - 1
            else:
                continue
        return TB

    index += 1
    stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
    tb = create_candidates(df, thetaCTTD, thetaGap)
    TB = get_unique_candidate(tb)
    filewrite = open(stored_file, "wb")
    df_list = []
    for (k, d) in TB.iteritems():
        candidate_df = df.loc[d['start']:d['end']]
        candidate_df['candidate'] = k
        df_list.append(candidate_df)
    output_df = pd.concat(df_list)
    output_df.to_csv(stored_file)
    writer = csv.writer(filewrite, lineterminator='\n')
    filewrite.close
thetaCTTD is 10.36 and thetaGap is 1.
The output means there are 2 candidate text blocks: the first candidate starts at line number 215 and ends at line number 225, and the other candidate starts at line number 500 and ends at line number 501.
My question is: how can I save the output to CSV so that it contains not just the line numbers but the whole range of each text block, with the other columns included in the output as well?
My expected output looks like the screenshot of the candidate text block.
Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
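For illustration (my addition, not from the original answer): label-based slicing in pandas includes both endpoints, unlike positional slicing.

import pandas as pd

df = pd.DataFrame({'TC': [5, 8, 3, 9]})
print(df.loc[1:3])    # rows with labels 1, 2 AND 3 (end label included)
print(df.iloc[1:3])   # rows at positions 1 and 2 only (end position excluded)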
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
    candidate_df = df.loc[d['start']:d['end']]
    candidate_df['candidate'] = k
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.
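For the "save to CSV" part (my note, not part of the original answer): once output_df exists it can be written with the standard pandas writer, which keeps every original column plus the new candidate column; keeping the index preserves the original line numbers of each block.

output_df.to_csv(stored_file)   # stored_file as defined in the question's loop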
I am trying to retrieve all data from an sqlite3 database, format it, and save it to a file, then read that file back, sort the records alphabetically, and save them to a new file. But it doesn't save them to the new file properly and I end up with more than one copy of each record. Can somebody help?
What I am trying to do:
Retrieve records from a database
Format these records
Save to a (unsorted) file
Take out the records from the (unsorted) file
Sort the records alphabetically
Save the (sorted) records to a new file
c.execute("SELECT * FROM Student, Behaviour")
data = c.fetchall()
currentRecords = open('Current Records - Unsorted', 'w')
l = []
for i in data: #for individual records in the whole database do:
record = str(i)
record = record.replace("u'","")
record = record.replace("'", "")
record = record.replace('"', '')
record = record.replace("(","")
record = record.replace(")", "")
record = record.replace(","," -")
currentRecords.write(record+"\r\n")
currentRecords.write('------------------------------------------------------------------------------'+"\r\n")
currentRecords.close()
y = open('Current Records - Unsorted','r')
z = y.read() #opening the file containing the unsorted, formatted records to read
l.append(z)
y.close()
#sort the array alphabetically
def sort(l):
less = []
equal = []
greater = []
if len(l) > 1:
pivot = l[0]
for x in l:
if x < pivot:
less.append(x)
elif x == pivot:
equal.append(x)
elif x > pivot:
greater.append(x)
return sort(less) + equal + sort(greater)
else:
if len(l) == 1 or len(l) == 0:
return l
sortedCurrentRecords = sort(l)
sortedCurrentRecordsFile = open('Current Records', 'w')
for individualRecords in sortedCurrentRecords:
sortedCurrentRecordsFile.write(individualRecords)
sortedCurrentRecordsFile.close()
sortedCurrentRecordsFile1 = 'Current Records'
subprocess.call(['open','-a','TextEdit', sortedCurrentRecordsFile1])
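For what it's worth (my note, the section contains no answer): y.read() returns the whole file as a single string, so l ends up with only one element and the sort has nothing to rearrange. Reading the file line by line gives the sorter one entry per record. A minimal sketch of that read-and-sort step, assuming each formatted record occupies one line followed by a dashed separator line as written above:

with open('Current Records - Unsorted', 'r') as y:
    # one entry per record; skip blank lines and the dashed separator lines
    records = [line for line in y.read().splitlines()
               if line and not line.startswith('---')]

with open('Current Records', 'w') as out:
    for record in sorted(records):   # built-in alphabetical sort, one record per line
        out.write(record + "\r\n")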