Walking and averaging values in Python

I have to process .txt files present in subfolders inside a folder, like:
New Folder > Folder 1 to 6 > xx.txt & yy.txt (files present in each folder)
Each file contains two columns, e.g.:
arg his
asp gln
glu his
and
arg his
glu arg
arg his
glu asp
Now what I have to do is:
1) count the number of occurrences of each word in each file, and average the total count by dividing by the total number of lines in that file;
2) then, with the values obtained after completing the first step, divide by the total number of files present in the folder to average them (i.e. 2 in this case). For example, 'his' occurs twice in the first file's 3 lines (2/3 ≈ 0.67) and twice in the second file's 4 lines (2/4 = 0.5), so the folder average for 'his' is (0.67 + 0.5)/2 ≈ 0.58.
I have tried the code below. I have succeeded with the first step, but I can't get the second step to work.
import os, glob

for root, dirs, files in os.walk(path):
    asp_count = 0
    glu_count = 0
    lys_count = 0
    arg_count = 0
    his_count = 0
    acid_count = 0
    base_count = 0
    count = 0
    listOfFile = glob.iglob(os.path.join(root, '*.txt'))
    for filename in listOfFile:
        lineCount = 0
        asp_count_col1 = 0
        asp_count_col2 = 0
        glu_count_col1 = 0
        glu_count_col2 = 0
        lys_count_col1 = 0
        lys_count_col2 = 0
        arg_count_col1 = 0
        arg_count_col2 = 0
        his_count_col1 = 0
        his_count_col2 = 0
        count += 1
        with open(filename) as inp:
            for line in map(str.split, inp):
                lineCount += 1
                k = line[4]   # field holding the first residue in the real files
                m = line[6]   # field holding the second residue
                if k == 'ASP':
                    asp_count_col1 += 1
                elif m == 'ASP':
                    asp_count_col2 += 1
                if k == 'GLU':
                    glu_count_col1 += 1
                elif m == 'GLU':
                    glu_count_col2 += 1
                if k == 'LYS':
                    lys_count_col1 += 1
                elif m == 'LYS':
                    lys_count_col2 += 1
                if k == 'ARG':
                    arg_count_col1 += 1
                elif m == 'ARG':
                    arg_count_col2 += 1
                if k == 'HIS':
                    his_count_col1 += 1
                elif m == 'HIS':
                    his_count_col2 += 1
        asp_count = float(asp_count_col1 + asp_count_col2) / lineCount
        glu_count = float(glu_count_col1 + glu_count_col2) / lineCount
        lys_count = float(lys_count_col1 + lys_count_col2) / lineCount
        arg_count = float(arg_count_col1 + arg_count_col2) / lineCount
        his_count = float(his_count_col1 + his_count_col2) / lineCount
Up to this point I am able to get the average value per file. But how can I get the average per subfolder (i.e. divide by count, the total number of files)?
The problem is the second part; the first part is done. The code above averages the values for each file, but I want to add these averages and make a new average by dividing by the total number of files present in the subfolder.

import os
from collections import *

aminoAcids = set('asp glu lys arg his'.split())
filesToCounts = {}
for root, dirs, files in os.walk(subfolderPath):
    for file in files:
        if file.endswith('.txt'):
            path = os.path.join(root, file)
            with open(path) as f:
                acidsInFile = f.read().split()
            assert all(a in aminoAcids for a in acidsInFile)
            filesToCounts[file] = Counter(acidsInFile)

def averageOfCounts(counts):
    numberOfAcids = sum(counts.values())
    assert numberOfAcids % 2 == 0
    numberOfAcidPairs = numberOfAcids / 2.0   # float, so the division below is not truncated
    return dict((acid, acidCount / numberOfAcidPairs) for acid, acidCount in counts.items())

filesToAverages = dict((file, averageOfCounts(counts)) for file, counts in filesToCounts.items())
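To get the per-subfolder average the question asks for in step 2, the per-file averages can then be averaged over the number of files. A minimal sketch, assuming every entry in filesToAverages comes from the same subfolder:

from collections import defaultdict

folderAverage = defaultdict(float)
for averages in filesToAverages.values():
    for acid, freq in averages.items():
        # accumulate each file's contribution to the folder-level mean
        folderAverage[acid] += freq / len(filesToAverages)
print(dict(folderAverage))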

Your use of os.walk together with glob.iglob is bogus. Either use one or the other, not both together. Here's how I would do it:
import os, os.path, re, pprint, sys
#...
for root, dirs, files in os.walk(path):
    counts = {}
    nlines = 0
    for f in filter(lambda n: re.search(r'\.txt$', n), files):
        # join with root, or open() will miss files outside the current directory
        for l in open(os.path.join(root, f), 'rt'):
            nlines += 1
            for k in l.split():
                counts[k] = counts[k] + 1 if k in counts else 1
    for k, v in counts.items():
        counts[k] = float(v) / nlines
    sys.stdout.write('Frequencies for directory %s:\n' % root)
    pprint.pprint(counts)

I like ninjagecko's answer but understand the question differently. Using his code as a starting point, I propose this:
import os
from collections import *

aminoAcids = set('asp glu lys arg his'.split())
subfolderFreqs = {}
for root, dirs, files in os.walk(subfolderPath):
    cumulativeFreqs = defaultdict(int)
    fileCount = 0
    for file in files:
        if file.endswith('.txt'):
            fileCount += 1
            path = os.path.join(root, file)
            with open(path) as f:
                acidsInFile = f.read().split()
            counts = Counter(acidsInFile)
            assert aminoAcids.issuperset(counts)
            numberOfAcidPairs = len(acidsInFile) / 2
            for acid, acidCount in counts.items():
                cumulativeFreqs[acid] += float(acidCount) / numberOfAcidPairs
    if fileCount:
        subfolderFreqs[root] = {acid: cumulative / fileCount for acid, cumulative in cumulativeFreqs.items()}
print(subfolderFreqs)


Parse filenames and make a total count of occurrences of the first two words, over 50K names

I wanted to count the total number of filenames that start with the same two unique words (tree names or other identifiers), iterating through the whole folder (over 50K files), recording those names and giving individual and overall totals for the occurrences.
The filenames look something like this, or variations thereof:
Abies_alba_0_2545_WEFL_NLF.tif
Abies_alba_8_321565_WEFL_NLF.tif
Larix_kaempferi_3_43357_WEFL_NLF.tif
I actually managed a workaround and got the results that I wanted, but it was very slow, since I had to manually capture the key parts of each string and repeat the script by hand. I used How to count number of files in a file with certain extension or name? as a basis and produced this:
import glob
import os
# These are our counters
total_count = 0
Abies_alba_count = 0
Acer_pseudoplatanus_count = 0
Alnus_spec_count = 0
Betula_spec_count = 0
Cleared_0_count = 0
Fagus_sylvatica_count = 0
Fraxinus_excelsior_count = 0
Larix_decidua_count = 0
Larix_kaempferi_count = 0
Picea_abies_count = 0
Pinus_nigra_count = 0
Pinus_strobus_count = 0
Pinus_sylvestris_count = 0
Populus_spec_count = 0
Prunus_spec_count = 0
Pseudotsuga_menziesii_count = 0
Quercus_petraea_count = 0
Quercus_robur_count = 0
Quercus_rubra_count = 0
Tilia_spec_count = 0
for file in os.listdir(r'FILEPATH TO FOLDER HOLDING THE FILES'):
    if file.endswith('tif'):
        total_count += 1
        if 'Abies_alba' in file:
            Abies_alba_count += 1
        if 'Acer_pseudoplatanus' in file:
            Acer_pseudoplatanus_count += 1
        if 'Alnus_spec' in file:
            Alnus_spec_count += 1
        if 'Betula_spec' in file:
            Betula_spec_count += 1
        if 'Cleared_0' in file:
            Cleared_0_count += 1
        if 'Fagus_sylvatica' in file:
            Fagus_sylvatica_count += 1
        if 'Fraxinus_excelsior' in file:
            Fraxinus_excelsior_count += 1
        if 'Larix_decidua' in file:
            Larix_decidua_count += 1
        if 'Larix_kaempferi' in file:
            Larix_kaempferi_count += 1
        if 'Picea_abies' in file:
            Picea_abies_count += 1
        if 'Pinus_nigra' in file:
            Pinus_nigra_count += 1
        if 'Pinus_strobus' in file:
            Pinus_strobus_count += 1
        if 'Pinus_sylvestris' in file:
            Pinus_sylvestris_count += 1
        if 'Populus_spec' in file:
            Populus_spec_count += 1
        if 'Prunus_spec' in file:
            Prunus_spec_count += 1
        if 'Pseudotsuga_menziesii' in file:
            Pseudotsuga_menziesii_count += 1
        if 'Quercus_petraea' in file:
            Quercus_petraea_count += 1
        if 'Quercus_robur' in file:
            Quercus_robur_count += 1
        if 'Quercus_rubra' in file:
            Quercus_rubra_count += 1
        if 'Tilia_spec' in file:
            Tilia_spec_count += 1
print('Abies alba:', Abies_alba_count)
print('Acer pseudoplatanus:', Acer_pseudoplatanus_count)
print('Alnus spec:', Alnus_spec_count)
print('Betula_spec:', Betula_spec_count)
print('Cleared 0:', Cleared_0_count)
print('Fagus sylvatica:', Fagus_sylvatica_count)
print('Fraxinus excelsior:', Fraxinus_excelsior_count)
print('Larix decidua:', Larix_decidua_count)
print('Larix kaempferi:', Larix_kaempferi_count)
print('Picea abies:', Picea_abies_count)
print('Pinus nigra:', Pinus_nigra_count)
print('Pinus strobus:', Pinus_strobus_count)
print('Pinus sylvestris:', Pinus_sylvestris_count)
print('Populus spec:', Populus_spec_count)
print('Prunus spec:', Prunus_spec_count)
print('Pseudotsuga menziesii:', Pseudotsuga_menziesii_count)
print('Quercus petraea:', Quercus_petraea_count)
print('Quercus robur:', Quercus_robur_count)
print('Quercus rubra:', Quercus_rubra_count)
print('Tilia spec:', Tilia_spec_count)
print('Total:', total_count)
Which works and gives me the results I wanted, as below:
Abies alba: 984
Acer pseudoplatanus: 2821
Alnus spec: 2563
Betula_spec: 2821
Cleared 0: 4123
Fagus sylvatica: 6459
Fraxinus excelsior: 2634
Larix decidua: 1360
Larix kaempferi: 1748
Picea abies: 5783
Pinus nigra: 421
Pinus strobus: 500
Pinus sylvestris: 6591
Populus spec: 464
Prunus spec: 304
Pseudotsuga menziesii: 2691
Quercus petraea: 2608
Quercus robur: 3453
Quercus rubra: 1841
Tilia spec: 212
Total: 50381
So, yes, this works, but it was awful to do, and I understand it to be "smelly" code, if I'm using the term correctly! Could someone advise on how to get to the end result without all the manual interference that I had to do?
I intend to also plot the output in some follow-up work, showing the number weighting of the file/tree types, but was trying to avoid writing to a CSV and working the results from there, as I understand that to be bad practice. Any further tips for this?
Use a dictionary! Updated to dynamically determine the words to search.
import os

words = set()
counts = {}
total_count = 0
file_names = os.listdir(r'FILEPATH TO FOLDER HOLDING THE FILES')
for file_name in file_names:
    fn_words = file_name.split("_")
    words.add(f"{fn_words[0]}_{fn_words[1]}")   # keep the underscore so the substring test below matches
for word in words:
    for file_name in file_names:
        if word in file_name and word in counts:
            counts[word] += 1
            total_count += 1
        elif word in file_name:
            counts[word] = 1
            total_count += 1
for word, count in counts.items():   # you could iterate through this dict in any other fashion than just printing
    print(f"{word}: {count}")
print(f"Total: {total_count}")
You can try this.
import os

file_names = os.listdir('YOUR_DIRECTORY')
file_counts = {}
for filename in file_names:
    if filename.endswith('tif'):
        filename_parts = filename.split('_')
        key = filename_parts[0] + '_' + filename_parts[1]
        if key in file_counts:
            file_counts[key] += 1
        else:
            file_counts[key] = 1
print(file_counts)
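For what it's worth, the same idea reads even more compactly with collections.Counter. A sketch, assuming the same placeholder folder path as above:

import os
from collections import Counter

folder = r'FILEPATH TO FOLDER HOLDING THE FILES'   # placeholder path, as in the question
counts = Counter(
    '_'.join(name.split('_')[:2])                  # first two underscore-separated words
    for name in os.listdir(folder)
    if name.endswith('tif')
)
print(counts)
print('Total:', sum(counts.values()))

And since you mentioned plotting without a CSV round-trip, the resulting dict can go straight into matplotlib; a rough sketch:

import matplotlib.pyplot as plt

names, values = zip(*sorted(counts.items()))
plt.bar(names, values)
plt.xticks(rotation=90)   # species names are long, so tilt the labels
plt.tight_layout()
plt.show()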

When I use a for loop with an array it doesn't work and uses the number of items instead of going item by item

Basically, in the last for loop the k variable uses the number of items in the list, and then I get a single wrong answer rather than the multiple answers I want. I am trying to compute the n-th roots of a complex number (if my question isn't clear, sorry, I'm not a native English speaker; I'll do my best to make it clearer).
from math import *

deg = int(input("enter the degree:"))
re = int(input("the real part:"))
im = int(input("the imaginary part:"))
counter = 0
while counter < deg:
    counter = counter + 1
    kkk = []
    kkk.append(counter)
r = sqrt(pow(re, 2) + pow(im, 2))
if im != 0:
    teton = round(pi / radians(degrees(acos(re / r))), 1)
else:
    teton = round(pi / radians(degrees(acos(im / r))), 1)
if round(r) != r:
    r = "sqrt(", (pow(re, 2) + pow(im, 2)), ")"
else:
    r = r
teta = "pi/%s" % teton
print("z = ", r, "e^i", teta)
for k in kkk:
    if re != 0 or im != 0:
        print(r, "e^i*2*", teta, "*", k, "pi")
    else:
        print(r, "^1/", deg, "e^i(", teta, "/", deg, " +(2", k, "pi)/", deg)
    print(k)
If I understood the problem correctly, you are saying that the for loop is not iterating over all the items in the list kkk.
If you check your code, the list kkk always has only one item, because you are initializing it and appending to it in the same loop.
Please move the statement below out of the first loop:
kkk = []
like below.
from math import *

deg = int(input("enter the degree:"))
re = int(input("the real part:"))
im = int(input("the imaginary part:"))
counter = 0
kkk = []
while counter < deg:
    counter = counter + 1
    kkk.append(counter)
r = sqrt(pow(re, 2) + pow(im, 2))
if im != 0:
    teton = round(pi / radians(degrees(acos(re / r))), 1)
else:
    teton = round(pi / radians(degrees(acos(im / r))), 1)
if round(r) != r:
    r = "sqrt(", (pow(re, 2) + pow(im, 2)), ")"
else:
    r = r
teta = "pi/%s" % teton
print("z = ", r, "e^i", teta)
for k in kkk:
    if re != 0 or im != 0:
        print(r, "e^i*2*", teta, "*", k, "pi")
    else:
        print(r, "^1/", deg, "e^i(", teta, "/", deg, " +(2", k, "pi)/", deg)
    print(k)
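As an aside (not part of the original answer), the n-th roots themselves can be computed numerically with the standard cmath module; a minimal sketch:

import cmath

def nth_roots(z, n):
    """Return the n complex n-th roots of z."""
    r, theta = cmath.polar(z)   # modulus and argument of z
    return [cmath.rect(r ** (1.0 / n), (theta + 2 * cmath.pi * k) / n)
            for k in range(n)]

# example: the three cube roots of 8i
for root in nth_roots(complex(0, 8), 3):
    print(root)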

Python pandas: how to save output to CSV

Hello, I'm now working on my project. I want to get candidate text blocks by using the algorithm below.
My input is a CSV document which contains:
HTML column: the HTML code in a line
TAG column: the tag of the HTML code in a line
Words: the text inside the tag in a line
TC: the number of words in a line
LTC: the number of anchor words in a line
TG: the number of tags in a line
P: the number of p and br tags in a line
CTTD: TC + (0.2*LTC) + TG - P
CTTDs: the smoothed CTTD
This is my algorithm to find candidate text blocks. I load the CSV file into a dataframe using pandas and use the CTTDs, TC and TG columns to find the candidates.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv

filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
    file_html = open(str(f), "r")
    df = pd.read_csv(file_html)
    #df = pd.read_csv('smoothing/Smoothing001.csv')
    news = np.array(df['CTTDs'])
    new = np.array(df['TG'])
    minval = np.min(news[np.nonzero(news)])
    maxval = np.max(news[np.nonzero(news)])
    j = 0.2
    thetaCTTD = minval + j * (maxval - minval)
    #maxGap = np.max(new[np.nonzero(new)])
    #minGap = np.min(new[np.nonzero(new)])
    thetaGap = np.min(new[np.nonzero(new)])
    #print thetaCTTD
    #print maxval
    #print minval
    #print thetaGap

    def create_candidates(df, thetaCTTD, thetaGAP):
        k = 0
        TB = {}
        TC = 0
        for index in range(0, len(df) - 1):
            start = index
            if df.ix[index]['CTTDs'] > thetaCTTD:
                start = index
                gap = 0
                TC = df.ix[index]['TC']
                for index in range(index + 1, len(df) - 1):
                    if df.ix[index]['TG'] == 0:
                        continue
                    elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
                        break
                    elif df.ix[index]['CTTDs'] <= thetaCTTD:
                        gap += 1
                    TC += df.ix[index]['TC']
                if (TC < 1) or (start == index):
                    continue
                TB.update({
                    k: {
                        'start': start,
                        'end': index - 1
                    }
                })
                k += 1
        return TB

    def get_unique_candidate(tb):
        TB = tb.copy()
        for key, value in tb.iteritems():
            if key == len(tb) - 1:
                break
            if value['end'] == tb[key + 1]['end']:
                del TB[key + 1]
            elif value['start'] < tb[key + 1]['start'] < value['end']:
                TB[key]['end'] = tb[key + 1]['start'] - 1
            else:
                continue
        return TB

    index += 1
    stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
    tb = create_candidates(df, thetaCTTD, thetaGap)
    TB = get_unique_candidate(tb)
    filewrite = open(stored_file, "wb")
    df_list = []
    for (k, d) in TB.iteritems():
        candidate_df = df.loc[d['start']:d['end']]
        candidate_df['candidate'] = k
        df_list.append(candidate_df)
    output_df = pd.concat(df_list)
    output_df.to_csv(stored_file)
    writer = csv.writer(filewrite, lineterminator='\n')
    filewrite.close()
thetaCTTD is 10.36 and thetaGap is 1.
The output (shown in a screenshot that is not included here) means there are 2 candidate text blocks: the first starts at line number 215 and ends at line number 225, and the other starts at line number 500 and ends at line number 501.
My question is how to save the output to CSV so that not only the line numbers but the whole range of the text block, along with the other columns, appears in the output too?
My expected output is like the screenshot of the candidate text block (not included here).
Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
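A quick illustration of that label-inclusive behaviour, since it differs from positional slicing:

import pandas as pd

df = pd.DataFrame({'a': range(5)})
print(df.loc[2:4])    # label-based: rows 2, 3 AND 4
print(df.iloc[2:4])   # position-based: rows 2 and 3 only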
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
    candidate_df = df.loc[d['start']:d['end']]
    candidate_df['candidate'] = k
    df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.
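To answer the title question directly: once output_df exists, pandas writes it out in one call (the file name here is just an example):

output_df.to_csv('textcandidates.csv')   # row labels (the original line numbers) are written by default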

Optimizing my Benford's law program

lines = []
count1 = 0
count2 = 0
count3 = 0
count4 = 0
count5 = 0
count6 = 0
count7 = 0
count8 = 0
count9 = 0
allcount = 0
with open('city_all.txt', 'r') as file:
    for line in file:
        lines.append(line.strip())
for x in range(0, len(lines)):
    if lines[x].isdigit():
        allcount += 1
        string = lines[x]
        if string[0] == "1":
            count1 += 1
        elif string[0] == "2":
            count2 += 1
        elif string[0] == "3":
            count3 += 1
        elif string[0] == "4":
            count4 += 1
        elif string[0] == "5":
            count5 += 1
        elif string[0] == "6":
            count6 += 1
        elif string[0] == "7":
            count7 += 1
        elif string[0] == "8":
            count8 += 1
        elif string[0] == "9":
            count9 += 1
print(count1 / allcount)
print('{:.1%}'.format(count1 / allcount))
I'm wondering if there is any way to avoid declaring all these variables and to compact all the if statements. I'm trying to make a program to help compute Benford's law, so I read a txt file into a list, then go through each element and check what its starting digit is.
You can simplify it a bit:
allcount = 0
counts = [0 for _ in range(10)]   # counts[d] holds the tally for numbers whose leading digit is d
with open('city_all.txt', 'r') as f:
    for line in (x.strip() for x in f):
        if line.isdigit():
            allcount += 1
            counts[int(line[0])] += 1   # index by the first digit, not the whole number
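A variant sketch using collections.Counter, assuming the same city_all.txt input, that also prints the distribution for digits 1-9:

from collections import Counter

with open('city_all.txt') as f:
    digits = Counter(line.strip()[0] for line in f if line.strip().isdigit())

total = sum(digits.values())
for d in '123456789':
    print('{}: {:.1%}'.format(d, digits[d] / total))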

Memory overflow in Python

I have 67,000 files, and I need to read them and extract similarities between the words. But when I run the code my laptop becomes much slower, I can't open any other application, and then a memory overflow error shows up (even when I run on around 10,000 of the files). Is there a way to clear the memory after every for loop, or will running the code on all the files be impossible? Below is the code:
import math
import operator
import os
import string
import nltk

def isAscii(s):
    for c in s:
        if c not in string.printable:
            return False
    return True

windowSize = 2
relationTable = {}
probabilities = {}
wordCount = {}
totalWordCount = 0

def sim(w1, w2):
    numerator = 0
    denominator = 0
    if (w1 in relationTable) and (w2 in relationTable):
        rtw1 = relationTable[w1]
        rtw2 = relationTable[w2]
        for word in rtw1:
            rtw1_PMI = rtw1[word]['pairPMI']
            denominator += rtw1_PMI
            if word in rtw2:
                rtw2_PMI = rtw2[word]['pairPMI']
                numerator += (rtw1_PMI + rtw2_PMI)
        for word in rtw2:
            rtw2_PMI = rtw2[word]['pairPMI']
            denominator += rtw2_PMI
        if denominator != 0:
            return float(numerator) / denominator
        else:
            return 0
    else:
        return -1

AllNotes = os.listdir("C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes")
fileStopPunctuations = open('C:/Users/nerry-san/Desktop/EECE 502/stopPunctuations.txt')
stopPunctuations = nltk.word_tokenize(fileStopPunctuations.read())

for x in range(0, 10):
    fileToRead = open('C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % (AllNotes[x]))
    case1 = fileToRead.read()
    text = nltk.WordPunctTokenizer().tokenize(case1.lower())
    final_text = []
    for index in range(len(text)):
        word = text[index]
        if word not in stopPunctuations:
            final_text.append(word)
    for index in range(len(final_text)):
        w1 = final_text[index]
        if isAscii(w1):
            for index2 in range(-windowSize, windowSize + 1):
                if index2 != 0:
                    if (index + index2) in range(0, len(final_text)):
                        w2 = final_text[index + index2]
                        if isAscii(w2):
                            totalWordCount += 1
                            if w1 not in wordCount:
                                wordCount[w1] = {}
                                wordCount[w1]['wCount'] = 0
                            try:
                                wordCount[w1][w2]['count'] += 1
                                wordCount[w1]['wCount'] += 1
                            except KeyError:
                                wordCount[w1][w2] = {'count': 1}
                                wordCount[w1]['wCount'] += 1

for word in wordCount:
    probabilities[word] = {}
    probabilities[word]['wordProb'] = float(wordCount[word]['wCount']) / totalWordCount

for word in wordCount:
    relationTable[word] = {}
    for word2 in wordCount[word]:
        if word2 != 'wCount':
            pairProb = float(wordCount[word][word2]['count']) / (wordCount[word]['wCount'])
            relationTable[word][word2] = {}
            relationTable[word][word2]['pairPMI'] = math.log(float(pairProb) / (probabilities[word]['wordProb'] * probabilities[word2]['wordProb']), 2)

l = []
for word in relationTable:
    l.append(word)
for index in range(0, len(l)):
    word = l[index]
    simValues = []
    for index2 in range(0, len(l)):
        word2 = l[index2]
        if word != word2:
            simVal = sim(word, word2)
            if simVal > 0:
                simValues.append([word2, simVal])
    simValues.sort(key=operator.itemgetter(1), reverse=True)
Every time you open a file, use the "with" statement. This will ensure the file is closed when the loop finishes (or rather, when the with block is exited).
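A minimal sketch of that fix, applied to the question's reading loop (same paths as in the question):

for x in range(0, 10):
    path = 'C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % AllNotes[x]
    with open(path) as fileToRead:
        case1 = fileToRead.read()
    # the handle is closed as soon as the with block exits, so open handles
    # do not pile up across thousands of files

The same applies to fileStopPunctuations, which is also never closed.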
