Memory overflow in Python

I have 67,000 files. I need to read them and extract similarities between the words, but when I run the code my laptop becomes much slower, I can't open any other application, and then a memory overflow error shows up (even when I run it on only around 10,000 of the files). Is there a way to clear the memory after every iteration of the for loop, or will running the code on all the files be impossible? Below is the code:
# Implied imports (not shown in the original snippet):
import string, os, math, operator
import nltk

def isAscii(s):
    for c in s:
        if c not in string.printable:
            return False
    return True

windowSize = 2
relationTable = {}
probabilities = {}
wordCount = {}
totalWordCount = 0

def sim(w1, w2):
    numerator = 0
    denominator = 0
    if (w1 in relationTable) and (w2 in relationTable):
        rtw1 = {}
        rtw2 = {}
        rtw1 = relationTable[w1]
        rtw2 = relationTable[w2]
        for word in rtw1:
            rtw1_PMI = rtw1[word]['pairPMI']
            denominator += rtw1_PMI
            if word in rtw2:
                rtw2_PMI = rtw2[word]['pairPMI']
                numerator += (rtw1_PMI + rtw2_PMI)
        for word in rtw2:
            rtw2_PMI = rtw2[word]['pairPMI']
            denominator += rtw2_PMI
        if denominator != 0:
            return float(numerator) / denominator
        else:
            return 0
    else:
        return -1

AllNotes = {}
AllNotes = os.listdir("C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes")
fileStopPunctuations = open('C:/Users/nerry-san/Desktop/EECE 502/stopPunctuations.txt')
stopPunctuations = nltk.word_tokenize(fileStopPunctuations.read())

for x in range(0, 10):
    fileToRead = open('C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % (AllNotes[x]))
    case1 = fileToRead.read()
    text = nltk.WordPunctTokenizer().tokenize(case1.lower())
    final_text = []
    for index in range(len(text)):
        word = text[index]
        if word not in stopPunctuations:
            final_text.append(word)
    for index in range(len(final_text)):
        w1 = final_text[index]
        if isAscii(w1):
            for index2 in range(-windowSize, windowSize + 1):
                if index2 != 0:
                    if (index + index2) in range(0, len(final_text)):
                        w2 = final_text[index + index2]
                        if isAscii(w2):
                            totalWordCount += 1
                            if w1 not in wordCount:
                                wordCount[w1] = {}
                                wordCount[w1]['wCount'] = 0
                            try:
                                wordCount[w1][w2]['count'] += 1
                                wordCount[w1]['wCount'] += 1
                            except KeyError:
                                wordCount[w1][w2] = {'count': 1}
                                wordCount[w1]['wCount'] += 1

for word in wordCount:
    probabilities[word] = {}
    probabilities[word]['wordProb'] = float(wordCount[word]['wCount']) / totalWordCount

for word in wordCount:
    relationTable[word] = {}
    for word2 in wordCount[word]:
        if word2 != 'wCount':
            pairProb = float(wordCount[word][word2]['count']) / (wordCount[word]['wCount'])
            relationTable[word][word2] = {}
            relationTable[word][word2]['pairPMI'] = math.log(float(pairProb) / (probabilities[word]['wordProb'] * probabilities[word2]['wordProb']), 2)

l = []
for word in relationTable:
    l.append(word)
for index in range(0, len(l)):
    word = l[index]
    simValues = []
    for index2 in range(0, len(l)):
        word2 = l[index2]
        if word != word2:
            simVal = sim(word, word2)
            if simVal > 0:
                simValues.append([word2, simVal])
    simValues.sort(key=operator.itemgetter(1), reverse=True)

Every time you open a file, use the "with" statement. This will ensure the file is closed when the loop iteration finishes (or rather, when the with block is exited).
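A minimal sketch of the reading loop rewritten that way, reusing the paths and names from the question:

for x in range(0, 10):
    path = 'C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % (AllNotes[x])
    with open(path) as fileToRead:
        case1 = fileToRead.read()
    # the handle is closed here, as soon as the with block is exited
    text = nltk.WordPunctTokenizer().tokenize(case1.lower())
    # ... rest of the loop body unchanged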


How to turn a linear string into a trie?

I am using the Make Me a Hanzi open-source Chinese character dataset. As part of this dataset there are strings which provide the decomposition of Chinese characters into their individual units (called radicals). I want to turn the strings describing the decomposition of characters into tries (so that I can use networkx to render the decompositions).
For example for this database entry:
{"character":"⺳","definition":"net, network","pinyin":[],"decomposition":"⿱冖八","radical":"⺳","matches":[[0],[0],[1],[1]]}
The decomposition tree for this character would be:
- Node(1, char='⿱')
- Node(2, char='冖') # an edge connects '⿱' to '冖'
- Node(3, char='八') # an edge connects '⿱' to '八'
So far, I have come up with a script to turn the string decompositions into dictionaries (but not graphs).
# Implied imports (not shown in the original snippet):
import os, sys, json

decomposition_types = {
    'top-bottom': '⿱',
    'left-right': '⿰',
    'diagonal-corners': '⿻',
    'over-under': '⿺',
    'under-over': '⿹',
    'over-under-reversed': '⿸',
    'top-bottom-middle': '⿳',
    'left-right-middle': '⿲',
    'inside-outside': '⿴',
    'outside-inside': '⿵',
    'outside-inside2': '⿷',
    'inside-outside2': '⿶'
    # 'unknown': '?'
}
decomposition_types_reversed = dict(((value, key) for key, value in decomposition_types.items()))

file = []
if not os.path.isfile('data/dictionary.json'):
    with open('data/dictionary.txt') as d:
        for line in d:
            file.append(json.loads(line))
    for i, item in enumerate(file):
        item['id'] = i + 1
    json.dump(file, open('data/dictionary.json', 'w+'))
else:
    file = json.load(open('data/dictionary.json'))

def is_parsed(blocks):
    for block in blocks:
        if not block['is_unit']:
            return False
    return True

def search(character, dictionary=file):
    for hanzi in dictionary:
        if hanzi['character'] == character:
            return hanzi
    return False

def parse(decomp):
    if len(decomp) == 1:
        return {"spacing": '?'}
    blocks = []
    n_loops = 0
    for item in decomp:
        blocks.append({"char": item, "is_spacing": item in decomposition_types_reversed, "is_unit": False})
    while not is_parsed(blocks):
        for i, item in enumerate(blocks):
            if "is_spacing" in item:
                if item['is_spacing']:
                    next_items = decomposition_types_reversed[item['char']].count('-') + 1
                    can_match = True
                    for x in blocks[i + 1:i + 1 + next_items]:
                        try:
                            if x['char'] in decomposition_types_reversed:
                                can_match = False
                        except KeyError:
                            pass
                    if can_match:
                        blocks[i] = {"spacing": item['char'],
                                     "chars": [l['char'] if 'char' in l else l for l in
                                               blocks[i + 1:i + 1 + next_items]],
                                     "is_unit": True}
                        del blocks[i + 1:i + 1 + next_items]
        n_loops += 1
        if n_loops > 10:
            print(decomp)
            sys.exit()
    return blocks
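As a rough sketch of the stated goal (a networkx graph rather than a nested dictionary), a small recursive parser along the following lines could work. It reuses the decomposition_types_reversed mapping above and assumes each spacing character is immediately followed by its components; decomposition_to_graph is a hypothetical name, not code from the post:

import networkx as nx

def decomposition_to_graph(decomp):
    # Recursive descent over the decomposition string: a spacing character
    # consumes 2 components (or 3 for the "-middle" variants), anything else
    # becomes a leaf node.
    graph = nx.DiGraph()
    chars = iter(decomp)
    counter = [0]

    def build():
        char = next(chars)
        counter[0] += 1
        node_id = counter[0]
        graph.add_node(node_id, char=char)
        if char in decomposition_types_reversed:
            arity = decomposition_types_reversed[char].count('-') + 1
            for _ in range(arity):
                graph.add_edge(node_id, build())
        return node_id

    build()
    return graph

g = decomposition_to_graph("⿱冖八")
print(g.nodes(data=True))  # nodes 1 '⿱', 2 '冖', 3 '八'; edges 1->2 and 1->3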

When I use a for loop with an array it doesn't work: it uses the number of items instead of going item by item

Basically, in the last for loop the k variable takes the number of items in the list, so I get a single wrong answer rather than the multiple answers I want. I am trying to compute the n-th roots of a complex number. (If my question isn't clear, sorry, I'm not a native English speaker; I'll do my best to make it clearer.)
from math import *

deg = int(input("entrez le degré:"))
re = int(input("le réel:"))
im = int(input("l'imaginaire:"))
counter = 0
while counter < deg:
    counter = counter + 1
    kkk = []
    kkk.append(counter)
r = sqrt(pow(re, 2) + pow(im, 2))
if im != 0:
    teton = round(pi / radians(degrees(acos(re / r))), 1)
else:
    teton = round(pi / radians(degrees(acos(im / r))), 1)
if round(r) != r:
    r = "sqrt(", (pow(re, 2) + pow(im, 2)), ")"
else:
    r = r
teta = "pi/%s" % teton
print("z = ", r, "e^i", teta)
for k in kkk:
    if re != 0 or im != 0:
        print(r, "e^i*2*", teta, "*", k, "pi")
    else:
        print(r, "^1/", deg, "e^i(", teta, "/", deg, " +(2", k, "pi)/", deg)
    print(k)
If I understood the problem correctly, you are saying that the for loop is not iterating over all the items in the list kkk.
If you check your code, the list kkk always has only one item, because you are initializing it and appending to it in the same loop.
Please move the statement below out of the first loop:
kkk = []
Like below:
from math import *

deg = int(input("entrez le degré:"))
re = int(input("le réel:"))
im = int(input("l'imaginaire:"))
counter = 0
kkk = []
while counter < deg:
    counter = counter + 1
    kkk.append(counter)
r = sqrt(pow(re, 2) + pow(im, 2))
if im != 0:
    teton = round(pi / radians(degrees(acos(re / r))), 1)
else:
    teton = round(pi / radians(degrees(acos(im / r))), 1)
if round(r) != r:
    r = "sqrt(", (pow(re, 2) + pow(im, 2)), ")"
else:
    r = r
teta = "pi/%s" % teton
print("z = ", r, "e^i", teta)
for k in kkk:
    if re != 0 or im != 0:
        print(r, "e^i*2*", teta, "*", k, "pi")
    else:
        print(r, "^1/", deg, "e^i(", teta, "/", deg, " +(2", k, "pi)/", deg)
    print(k)
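As an aside, and separate from the fix above, the n distinct n-th roots can also be computed directly with the cmath module. This is a minimal sketch; nth_roots is a hypothetical helper, not part of either snippet:

import cmath
from math import pi

def nth_roots(z, n):
    # The n distinct n-th roots of z share the modulus r**(1/n) and have
    # arguments (theta + 2*pi*k)/n for k = 0 .. n-1.
    r, theta = cmath.polar(z)
    return [cmath.rect(r ** (1.0 / n), (theta + 2 * pi * k) / n) for k in range(n)]

print(nth_roots(complex(re, im), deg))  # reuses deg, re and im read above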

Python Last 6 Results, removing the last

I just can't get it done. Therefore I'll post the full code.
The .csv used is from http://www.football-data.co.uk/mmz4281/1415/E0.csv
Now when it runs, the variables home_team_a, home_team_d, away_team_a and away_team_d are based on all of the previous matches, but I want them to always be based on the last 6 matches only.
import csv, math, ast, numpy as np

def poisson(actual, mean):
    return math.pow(mean, actual) * math.exp(-mean) / math.factorial(actual)

csvFile = '20152016.csv'
team_list = []
k = open('team_list.txt', 'w')
k.write("""{
""")
csvRead = csv.reader(open(csvFile))
next(csvRead)

for row in csvRead:
    if row[2] not in team_list:
        team_list.append(row[2])
    if row[3] not in team_list:
        team_list.append(row[3])
team_list.sort()

for team in team_list:
    k.write(""" '%s': {'home_goals': 0, 'away_goals': 0, 'home_conceded': 0, 'away_conceded': 0, 'home_games': 0, 'away_games': 0, 'alpha_h': 0, 'beta_h': 0, 'alpha_a': 0, 'beta_a': 0},
""" % (team))
k.write("}")
k.close()

s = open('team_list.txt', 'r').read()
dict = ast.literal_eval(s)

GAMES_PLAYED = 0
WEEKS_WAIT = 4
TOTAL_VALUE = 0

csvRead = csv.reader(open(csvFile))
next(csvRead)

for game in csvRead:
    home_team = game[2]
    away_team = game[3]
    home_goals = int(game[4])
    away_goals = int(game[5])
    home_win_prob = 0
    draw_win_prob = 0
    away_win_prob = 0
    curr_home_goals = 0
    curr_away_goals = 0
    avg_home_goals = 1
    avg_away_goals = 1
    team_bet = ''
    ev_bet = ''
    # GETTING UPDATED VARIABLES
    for key, value in dict.items():
        curr_home_goals += dict[key]['home_goals']
        curr_away_goals += dict[key]['away_goals']
    if GAMES_PLAYED > (WEEKS_WAIT * 10):
        avg_home_goals = curr_home_goals / (GAMES_PLAYED)
        avg_away_goals = curr_away_goals / (GAMES_PLAYED)
    # CALCULATING FACTORS
    if GAMES_PLAYED > (WEEKS_WAIT * 10):
        home_team_a = (dict[home_team]['alpha_h'] + dict[home_team]['alpha_a']) / 2
        away_team_a = (dict[away_team]['alpha_h'] + dict[away_team]['alpha_a']) / 2
        home_team_d = (dict[home_team]['beta_h'] + dict[home_team]['beta_a']) / 2
        away_team_d = (dict[away_team]['beta_h'] + dict[away_team]['beta_a']) / 2
        home_team_exp = avg_home_goals * home_team_a * away_team_d
        away_team_exp = avg_away_goals * away_team_a * home_team_d
        # RUNNING POISSON
        l = open('poisson.txt', 'w')
        for i in range(10):
            for j in range(10):
                prob = poisson(i, home_team_exp) * poisson(j, away_team_exp)
                l.write("Prob%s%s = %s\n" % (i, j, prob))
        l.close()
        with open('poisson.txt') as f:
            for line in f:
                home_goals_m = int(line.split(' = ')[0][4])
                away_goals_m = int(line.split(' = ')[0][5])
                prob = float(line.split(' = ')[1])
                if home_goals_m > away_goals_m:
                    home_win_prob += prob
                elif home_goals_m == away_goals_m:
                    draw_win_prob += prob
                elif home_goals_m < away_goals_m:
                    away_win_prob += prob
        # CALCULATE VALUE
        bet365odds_h, bet365odds_d, bet365odds_a = float(game[23]), float(game[24]), float(game[25])
        ev_h = (home_win_prob * (bet365odds_h - 1)) - (1 - home_win_prob)
        ev_d = (draw_win_prob * (bet365odds_d - 1)) - (1 - draw_win_prob)
        ev_a = (away_win_prob * (bet365odds_a - 1)) - (1 - away_win_prob)
        highestEV = max(ev_h, ev_d, ev_a)
        if (ev_h == highestEV) and (ev_h > 0):
            team_bet = home_team
            ev_bet = ev_h
            if home_goals > away_goals:
                TOTAL_VALUE += (bet365odds_h - 1)
            else:
                TOTAL_VALUE -= 1
        elif (ev_d == highestEV) and (ev_d > 0):
            team_bet = 'Draw'
            ev_bet = ev_d
            if home_goals == away_goals:
                TOTAL_VALUE += (bet365odds_d - 1)
            else:
                TOTAL_VALUE -= 1
        elif (ev_a == highestEV) and (ev_a > 0):
            team_bet = away_team
            ev_bet = ev_a
            if home_goals < away_goals:
                TOTAL_VALUE += (bet365odds_a - 1)
            else:
                TOTAL_VALUE -= 1
        if (team_bet != '') and (ev_bet != ''):
            print("Bet on '%s' (EV = %s)" % (team_bet, ev_bet))
            print(TOTAL_VALUE)
    # UPDATE VARIABLES AFTER MATCH HAS BEEN PLAYED
    dict[home_team]['home_goals'] += home_goals
    dict[home_team]['home_conceded'] += away_goals
    dict[home_team]['home_games'] += 1
    dict[away_team]['away_goals'] += away_goals
    dict[away_team]['away_conceded'] += home_goals
    dict[away_team]['away_games'] += 1
    GAMES_PLAYED += 1
    # CREATE FACTORS
    if GAMES_PLAYED > (WEEKS_WAIT * 10):
        for key, value in dict.items():
            alpha_h = (dict[key]['home_goals'] / dict[key]['home_games']) / avg_home_goals
            beta_h = (dict[key]['home_conceded'] / dict[key]['home_games']) / avg_away_goals
            alpha_a = (dict[key]['away_goals'] / dict[key]['away_games']) / avg_away_goals
            beta_a = (dict[key]['away_conceded'] / dict[key]['away_games']) / avg_home_goals
            dict[key]['alpha_h'] = alpha_h
            dict[key]['beta_h'] = beta_h
            dict[key]['alpha_a'] = alpha_a
            dict[key]['beta_a'] = beta_a
Use a deque to keep the 6 most recent items in memory; adding a new record will "push out" the oldest one.
import collections
from itertools import islice  # a bare "import itertools" would leave islice() undefined below
import csv

with open("foo.csv") as fh:
    # Skip the first 44 rows
    csv_read = islice(csv.reader(fh), 44, None)
    # Initialize the deque with the next 6 rows
    d = collections.deque(islice(csv_read, 6), 6)
    for record in csv_read:
        d.append(record)
        print(list(d))  # Rows 46-51, then 47-52, then 48-53, etc.
Because you set the maximum length of the deque to 6, each append to a "full" deque pushes out the oldest item. On the first iteration, d.append pushes out row 45 and adds row 51. On the next iteration, adding row 52 pushes out row 46, and so on.
In general, a deque is a data structure that is like a combination of a queue and a stack; you can add or remove items at either end efficiently, but accessing an arbitrary item or modifying the "middle" is slow. Here, we're taking advantage of the fact that appending to a full deque causes an implicit removal from the opposite end.
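A minimal standalone illustration of the maxlen behaviour, separate from the answer's code:

import collections

d = collections.deque(maxlen=6)
for i in range(10):
    d.append(i)
print(list(d))  # [4, 5, 6, 7, 8, 9]: only the 6 most recent values remain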
How about:
if seen_records == 200:
    recs = list(csvRead)[seen_records - 6:seen_records + 1]
You can do something like this....
import csv

previous_index = 0
previous_max = 6  # max number of previous records to remember
previous = [None for _ in range(previous_max)]
csvFile = 'X.csv'
seen_records = 0
csvRead = csv.reader(open(csvFile))
# Enumerate over the records to keep track of the index of each one
for i, record in enumerate(csvRead):
    if i > 50:
        seen_records += 1
        if previous_index == previous_max:
            previous_index = 0  # Reset to the beginning when we reach the end
        # Store the record and increment the index to the next location
        previous[previous_index] = record
        previous_index += 1
This creates a very basic ring buffer of length previous_max: records are stored in order, and once the buffer is full the index wraps back to 0 so the oldest entry is overwritten by the newest one.
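If the remembered records are ever needed oldest-to-newest, a small helper along these lines could rotate the buffer once it has wrapped around (a sketch; in_order is hypothetical and not part of the original answer):

def in_order(previous, previous_index):
    # previous_index points at the slot that will be overwritten next,
    # i.e. the oldest entry, so start the result there and wrap around.
    return previous[previous_index:] + previous[:previous_index]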

For loop exits before reaching the range limit

First of all, sorry for any common mistakes; I'm not a native English speaker and I'm new to Python.
As lug_par is a list with 4 items, the second for loop should do 4 iterations. On the third iteration the if condition is true, so it should execute the break and get out of the loop. My problem is that it never gets to the third iteration, as if len(lug_par) were 2 and not 4; it jumps straight to cont_y = cont_y + 1.
import re

string = "R(95DHS(60PST_35FDP_05MTR)_05A(95DHS"
lug_par = [s.start() for s in re.finditer('\(', string)]
lug_par_cierra = [s.start() for s in re.finditer('\)', string)]
cont_y = 0
for i in (0, len(lug_par_cierra)):
    cont_x = 0
    for j in (0, len(lug_par)):
        if (lug_par[cont_x] > lug_par_cierra[cont_y]):
            c = lug_par.index(cont_x)
            borra = lug_par_cierra.index(c)
            break
        else:
            print lug_par[cont_x]
            cont_x = cont_x + 1
    cont_y = cont_y + 1
I think you just forgot the calls to range() in the for-loops. It should work as you expect once they are corrected.
Here is the corrected code:
import re

string = "R(95DHS(60PST_35FDP_05MTR)_05A(95DHS"
lug_par = [s.start() for s in re.finditer('\(', string)]
lug_par_cierra = [s.start() for s in re.finditer('\)', string)]
cont_y = 0
for i in range(0, len(lug_par_cierra)):  # range() added
    cont_x = 0
    for j in range(0, len(lug_par)):  # range() added
        if (lug_par[cont_x] > lug_par_cierra[cont_y]):
            c = lug_par.index(cont_x)
            borra = lug_par_cierra.index(c)
            break
        else:
            print lug_par[cont_x]
            cont_x = cont_x + 1
    cont_y = cont_y + 1

Why is my code getting an NZEC runtime error?

Question source: SPOJ, ORDERS
def swap(ary, idx1, idx2):
    tmp = ary[idx1]
    ary[idx1] = ary[idx2]
    ary[idx2] = tmp

def mkranks(size):
    tmp = []
    for i in range(1, size + 1):
        tmp = tmp + [i]
    return tmp

def permutations(ordered, movements):
    size = len(ordered)
    for i in range(1, size):  # the leftmost one never moves
        for j in range(0, int(movements[i])):
            swap(ordered, i - j, i - j - 1)
    return ordered

numberofcases = input()
for i in range(0, numberofcases):
    sizeofcase = input()
    tmp = raw_input()
    movements = ""
    for i in range(0, len(tmp)):
        if i % 2 != 1:
            movements = movements + tmp[i]
    ordered = mkranks(sizeofcase)
    ordered = permutations(ordered, movements)
    output = ""
    for i in range(0, sizeofcase - 1):
        output = output + str(ordered[i]) + " "
    output = output + str(ordered[sizeofcase - 1])
    print output
Having made your code a bit more Pythonic (but without altering its flow/algorithm):
def swap(ary, idx1, idx2):
    ary[idx1], ary[idx2] = [ary[i] for i in (idx2, idx1)]

def permutations(ordered, movements):
    size = len(ordered)
    for i in range(1, len(ordered)):
        for j in range(movements[i]):
            swap(ordered, i - j, i - j - 1)
    return ordered

numberofcases = input()
for i in range(numberofcases):
    sizeofcase = input()
    movements = [int(s) for s in raw_input().split()]
    ordered = [str(i) for i in range(1, sizeofcase + 1)]
    ordered = permutations(ordered, movements)
    output = " ".join(ordered)
    print output
I see it runs correctly in the sample case given at the SPOJ URL you indicate. What is your failing case?
