Python: keep only the last 6 results, removing the oldest - python

I just can't get it done, so I'll post the full code.
The .csv used is from http://www.football-data.co.uk/mmz4281/1415/E0.csv
As it stands, the variables home_team_a, home_team_d, away_team_a and away_team_d are based on all of the previous matches, but I want them to always be based on each team's last 6 matches only.
import csv, math, ast, numpy as np

def poisson(actual, mean):
    return math.pow(mean, actual) * math.exp(-mean) / math.factorial(actual)

csvFile = '20152016.csv'

team_list = []
k = open('team_list.txt', 'w')
k.write("""{
""")

csvRead = csv.reader(open(csvFile))
next(csvRead)

for row in csvRead:
    if row[2] not in team_list:
        team_list.append(row[2])
    if row[3] not in team_list:
        team_list.append(row[3])
team_list.sort()

for team in team_list:
    k.write(""" '%s': {'home_goals': 0, 'away_goals': 0, 'home_conceded': 0, 'away_conceded': 0, 'home_games': 0, 'away_games': 0, 'alpha_h': 0, 'beta_h': 0, 'alpha_a': 0, 'beta_a': 0},
""" % (team))
k.write("}")
k.close()

s = open('team_list.txt', 'r').read()
dict = ast.literal_eval(s)

GAMES_PLAYED = 0
WEEKS_WAIT = 4
TOTAL_VALUE = 0

csvRead = csv.reader(open(csvFile))
next(csvRead)

for game in csvRead:
    home_team = game[2]
    away_team = game[3]
    home_goals = int(game[4])
    away_goals = int(game[5])
    home_win_prob = 0
    draw_win_prob = 0
    away_win_prob = 0
    curr_home_goals = 0
    curr_away_goals = 0
    avg_home_goals = 1
    avg_away_goals = 1
    team_bet = ''
    ev_bet = ''
    # GETTING UPDATED VARIABLES
    for key, value in dict.items():
        curr_home_goals += dict[key]['home_goals']
        curr_away_goals += dict[key]['away_goals']
    if GAMES_PLAYED > (WEEKS_WAIT * 10):
        avg_home_goals = curr_home_goals / GAMES_PLAYED
        avg_away_goals = curr_away_goals / GAMES_PLAYED
    # CALCULATING FACTORS
    if GAMES_PLAYED > (WEEKS_WAIT * 10):
        home_team_a = (dict[home_team]['alpha_h'] + dict[home_team]['alpha_a']) / 2
        away_team_a = (dict[away_team]['alpha_h'] + dict[away_team]['alpha_a']) / 2
        home_team_d = (dict[home_team]['beta_h'] + dict[home_team]['beta_a']) / 2
        away_team_d = (dict[away_team]['beta_h'] + dict[away_team]['beta_a']) / 2
        home_team_exp = avg_home_goals * home_team_a * away_team_d
        away_team_exp = avg_away_goals * away_team_a * home_team_d
        # RUNNING POISSON
        l = open('poisson.txt', 'w')
        for i in range(10):
            for j in range(10):
                prob = poisson(i, home_team_exp) * poisson(j, away_team_exp)
                l.write("Prob%s%s = %s\n" % (i, j, prob))
        l.close()
        with open('poisson.txt') as f:
            for line in f:
                home_goals_m = int(line.split(' = ')[0][4])
                away_goals_m = int(line.split(' = ')[0][5])
                prob = float(line.split(' = ')[1])
                if home_goals_m > away_goals_m:
                    home_win_prob += prob
                elif home_goals_m == away_goals_m:
                    draw_win_prob += prob
                elif home_goals_m < away_goals_m:
                    away_win_prob += prob
        # CALCULATE VALUE
        bet365odds_h, bet365odds_d, bet365odds_a = float(game[23]), float(game[24]), float(game[25])
        ev_h = (home_win_prob * (bet365odds_h - 1)) - (1 - home_win_prob)
        ev_d = (draw_win_prob * (bet365odds_d - 1)) - (1 - draw_win_prob)
        ev_a = (away_win_prob * (bet365odds_a - 1)) - (1 - away_win_prob)
        highestEV = max(ev_h, ev_d, ev_a)
        if (ev_h == highestEV) and (ev_h > 0):
            team_bet = home_team
            ev_bet = ev_h
            if home_goals > away_goals:
                TOTAL_VALUE += (bet365odds_h - 1)
            else:
                TOTAL_VALUE -= 1
        elif (ev_d == highestEV) and (ev_d > 0):
            team_bet = 'Draw'
            ev_bet = ev_d
            if home_goals == away_goals:
                TOTAL_VALUE += (bet365odds_d - 1)
            else:
                TOTAL_VALUE -= 1
        elif (ev_a == highestEV) and (ev_a > 0):
            team_bet = away_team
            ev_bet = ev_a
            if home_goals < away_goals:
                TOTAL_VALUE += (bet365odds_a - 1)
            else:
                TOTAL_VALUE -= 1
        if (team_bet != '') and (ev_bet != ''):
            print("Bet on '%s' (EV = %s)" % (team_bet, ev_bet))
            print(TOTAL_VALUE)
    # UPDATE VARIABLES AFTER MATCH HAS BEEN PLAYED
    dict[home_team]['home_goals'] += home_goals
    dict[home_team]['home_conceded'] += away_goals
    dict[home_team]['home_games'] += 1
    dict[away_team]['away_goals'] += away_goals
    dict[away_team]['away_conceded'] += home_goals
    dict[away_team]['away_games'] += 1
    GAMES_PLAYED += 1
    # CREATE FACTORS
    if GAMES_PLAYED > (WEEKS_WAIT * 10):
        for key, value in dict.items():
            alpha_h = (dict[key]['home_goals'] / dict[key]['home_games']) / avg_home_goals
            beta_h = (dict[key]['home_conceded'] / dict[key]['home_games']) / avg_away_goals
            alpha_a = (dict[key]['away_goals'] / dict[key]['away_games']) / avg_away_goals
            beta_a = (dict[key]['away_conceded'] / dict[key]['away_games']) / avg_home_goals
            dict[key]['alpha_h'] = alpha_h
            dict[key]['beta_h'] = beta_h
            dict[key]['alpha_a'] = alpha_a
            dict[key]['beta_a'] = beta_a

Use a deque to keep the 6 most recent items in memory; adding a new record will "push out" the oldest one.
import collections
from itertools import islice
import csv

with open("foo.csv") as fh:
    # Skip the first 44 rows
    csv_read = islice(csv.reader(fh), 44, None)
    # Initialize the deque with the next 6 rows
    d = collections.deque(islice(csv_read, 6), 6)
    for record in csv_read:
        d.append(record)
        print(list(d))  # Rows 46-51, then 47-52, then 48-53, etc
Because you set the maximum length of the deque to 6, each append to a "full" deque pushes out the older one. On the first iteration, d.append pushes out row 45 and adds row 51. On the next iteration, adding row 52 pushes out row 46, etc.
In general, a deque is a data structure that is like a combination of a queue and a stack; you can add or remove items to either end efficiently, but accessing an arbitrary item or modifying the "middle" is slow. Here, we're taking advantage of the fact that appending to a full deque causes an implicit removal from the opposite end.
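Applied to the question's setup, a minimal sketch could keep one maxlen-6 deque of results per team and compute the rolling form from it. This reuses the column layout from the question's CSV; the recent mapping and the avg_scored helper are names introduced here for illustration, not part of the original code:
import csv
import collections

# One deque of (goals_for, goals_against) per team; appending a 7th result
# silently drops the oldest, so each deque always holds the last 6 matches.
recent = collections.defaultdict(lambda: collections.deque(maxlen=6))

def avg_scored(team):
    matches = recent[team]
    return sum(gf for gf, _ in matches) / len(matches) if matches else 0

with open('20152016.csv') as fh:
    reader = csv.reader(fh)
    next(reader)  # skip the header row
    for game in reader:
        home_team, away_team = game[2], game[3]
        home_goals, away_goals = int(game[4]), int(game[5])

        # Rolling form over the last 6 matches, to use in place of the
        # all-season totals when computing the attack/defence factors.
        home_form = avg_scored(home_team)
        away_form = avg_scored(away_team)

        # ...rest of the per-match calculation goes here...

        recent[home_team].append((home_goals, away_goals))
        recent[away_team].append((away_goals, home_goals))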

How about:
if seen_records == 200:
    recs = list(csvRead)[seen_records - 6:seen_records + 1]

You can do something like this:
import csv

previous_index = 0
previous_max = 6  # max number of previous records to remember
previous = [None for _ in range(previous_max)]
csvFile = 'X.csv'
seen_records = 0
csvRead = csv.reader(open(csvFile))
# Enumerate over the records to keep track of the index of each one
for i, record in enumerate(csvRead):
    if i > 50:
        seen_records += 1
        if previous_index == previous_max:
            previous_index = 0  # Reset to the beginning when we reach the end
        # Store the record and increment the index to the next location
        previous[previous_index] = record
        previous_index += 1
This creates a very basic ring buffer of length previous_max: once it is full, each new record overwrites the oldest one, so previous always holds the 6 most recent records (though not necessarily in chronological order by index).
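If the chronological order of those 6 records matters, a small sketch (reusing the previous and previous_index names from the snippet above) can rebuild it, because once the buffer is full the oldest surviving record sits at the slot that would be overwritten next:
def last_six_in_order(previous, previous_index):
    # Before the buffer fills up, insertion order equals index order.
    filled = [r for r in previous if r is not None]
    if len(filled) < len(previous):
        return filled
    # Once full, the oldest record sits at previous_index % len(previous),
    # so rotate the list to start there (oldest first, newest last).
    idx = previous_index % len(previous)
    return previous[idx:] + previous[:idx]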

Related

How to optimize the finding of divergences between 2 signals

I am trying to create an indicator that will find all the divergences between 2 signals.
The output of the function so far looks like this (example output image omitted).
But the problem is that it is painfully slow when I try to use it with long signals. Could anyone help me make it faster, if possible?
My code:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from scipy.signal import argrelextrema
from sortedcontainers import SortedSet

def find_divergence(price: pd.Series, indicator: pd.Series, width_divergence: int, order: int):
    div = pd.DataFrame(index=range(price.size), columns=[
        f"Bullish_{width_divergence}_{order}",
        f"Berish_{width_divergence}_{order}"
    ])
    div[f'Bullish_idx_{width_divergence}_{order}'] = False
    div[f'Berish_idx_{width_divergence}_{order}'] = False

    def calc_argrelextrema(price_: np.ndarray):
        return argrelextrema(price_, np.less_equal, order=order)[0]

    price_ranges = []
    for i in range(len(price)):
        price_ranges.append(price.values[0:i + 1])

    f = []
    with ThreadPoolExecutor(max_workers=16) as exe:
        for i in price_ranges:
            f.append(exe.submit(calc_argrelextrema, i))

    prices_lows = SortedSet()
    for r in concurrent.futures.as_completed(f):
        data = r.result()
        for d in reversed(data):
            if d not in prices_lows:
                prices_lows.add(d)
            else:
                break

    price_lows_idx = pd.Series(prices_lows)

    for idx_1 in range(price_lows_idx.size):
        min_price = price[price_lows_idx[idx_1]]
        min_indicator = indicator[price_lows_idx[idx_1]]
        for idx_2 in range(idx_1 + 1, idx_1 + width_divergence):
            if idx_2 >= price_lows_idx.size:
                break
            if price[price_lows_idx[idx_2]] < min_price:
                min_price = price[price_lows_idx[idx_2]]
            if indicator[price_lows_idx[idx_2]] < min_indicator:
                min_indicator = indicator[price_lows_idx[idx_2]]
            consistency_price_rd = min_price == price[price_lows_idx[idx_2]]
            consistency_indicator_rd = min_indicator == indicator[price_lows_idx[idx_1]]
            consistency_price_hd = min_price == price[price_lows_idx[idx_1]]
            consistency_indicator_hd = min_indicator == indicator[price_lows_idx[idx_2]]
            diff_price = price[price_lows_idx[idx_1]] - price[price_lows_idx[idx_2]]  # should be neg
            diff_indicator = indicator[price_lows_idx[idx_1]] - indicator[price_lows_idx[idx_2]]  # should be pos
            is_regular_divergence = diff_price > 0 and diff_indicator < 0
            is_hidden_divergence = diff_price < 0 and diff_indicator > 0
            if is_regular_divergence and consistency_price_rd and consistency_indicator_rd:
                div.at[price_lows_idx[idx_2], f'Bullish_{width_divergence}_{order}'] = (price_lows_idx[idx_1], price_lows_idx[idx_2])
                div.at[price_lows_idx[idx_2], f'Bullish_idx_{width_divergence}_{order}'] = True
            elif is_hidden_divergence and consistency_price_hd and consistency_indicator_hd:
                div.at[price_lows_idx[idx_2], f'Berish_{width_divergence}_{order}'] = (price_lows_idx[idx_1], price_lows_idx[idx_2])
                div.at[price_lows_idx[idx_2], f'Berish_idx_{width_divergence}_{order}'] = True
    return div

Single list.count instead of multiple

I have a parsed list of crew members, where each record looks like:
20;mechanic;0;68
21;cook;0;43
22;scientist;0;79
23;manager;1;65
24;mechanic;1;41
etc
Now I'm trying to figure out how to count the number of workers who have 60 or more stamina (the last element in each record).
Here is my code:
with open('employee.txt', 'r') as employee_list:
    count = 0
    for employee in employee_list.readlines():
        employee_data = employee.rstrip().split(';')
        if int(employee_data[3]) >= 60:
            count += 1
            print(count)
Print from terminal:
1
2
3
...
90
The final value is the right answer, I think, but is there any way to get only one 'total' count instead of 90 lines?
Just print one line after the loop is done.
with open('employee.txt', 'r') as employee_list:
    count = 0
    for employee in employee_list.readlines():
        employee_data = employee.rstrip().split(';')
        if int(employee_data[3]) >= 60:
            count += 1
    print(count)
But I would also recommend using pandas for data manipulation. For example:
import pandas as pd

df = pd.read_csv('employee.txt', sep=';', header=None)
df.columns = ['col1', 'col2', 'col3', 'stamina']
Then just filter and count the rows:
len(df[df.stamina >= 60])
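Equivalently, the count can come straight from the boolean mask, which avoids building a filtered copy of the frame:
count = (df['stamina'] >= 60).sum()  # True counts as 1, so this is the row count
print(count)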
After a day of thinking I wrote the following and got the right answer (maybe someone will find it helpful):
def total_resist_count():
    # with open('employee.txt', 'r') as employee_list:
    employee_list = [input() for i in range(120)]
    candidates = []
    for employee in employee_list:
        employee_data = employee.rstrip().split(';')
        if int(employee_data[3]) >= 60:
            candidates.append(employee_data)
    return candidates

required_professionals = {
    'computers specialist': 5,
    'cook': 3,
    'doctor': 5,
    'electrical engineer': 4,
    'manager': 1,
    'mechanic': 8,
    'scientist': 14
}
expedition_total = 40
female_min = 21
male_min = 12

def validate_solution(cur_team, num_females, num_males):
    global expedition_total, female_min, male_min
    if sum(cur_team) != expedition_total or num_females < female_min or num_males < male_min:
        return False
    num_of_free_vacancies = 0
    for k in required_professionals:
        num_of_free_vacancies += required_professionals[k]
    if num_of_free_vacancies > 0:
        return False
    return True

TEAM = None

def backtrack(candidates, cur_team, num_females, num_males):
    global required_professionals, expedition_total, TEAM
    if sum(cur_team) > expedition_total or TEAM is not None:
        return
    if validate_solution(cur_team, num_females, num_males):
        team = []
        for i, used in enumerate(cur_team):
            if used == 1:
                team.append(candidates[i])
        TEAM = team
        return
    for i in range(len(candidates)):
        if cur_team[i] == 0 and required_professionals[candidates[i][1]] > 0:
            cur_team[i] = 1
            required_professionals[candidates[i][1]] -= 1
            if candidates[i][2] == '1':
                backtrack(candidates, cur_team, num_females, num_males + 1)
            else:
                backtrack(candidates, cur_team, num_females + 1, num_males)
            required_professionals[candidates[i][1]] += 1
            cur_team[i] = 0

if __name__ == '__main__':
    ec = decode_fcc_message()
    candidates = total_resist_count(ec)
    cur_team = [0] * len(candidates)
    backtrack(candidates, cur_team, 0, 0)
    s = ""
    for t in TEAM:
        s += str(t[0]) + ';'
    print(s)

How to standardize address type properly

I'm trying to standardize street addresses by converting the abbreviations to the full word (e.g. RD -> Road). I created many lines to account for different spellings and ran into an issue where one replace call overrode another:
import pandas as pd
mydata = {'Street_type': ['PL', 'pl', 'Pl', 'PLACE', 'place']}
mydata = pd.DataFrame(mydata)
mydata['Street_type'] = mydata['Street_type'].replace('PL','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('Pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('PLACE','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('place','Place',regex=True)
Instead of Place, I got Placeace. What is the best way to avoid this error? Do I write an if-else statement or some other function? Thanks in advance!
Among other problems, you have overlapping logic: you fail to check that the target ("old") string is a full word before you replace it. For instance, with the input type of "PLACE", you trigger both the first and third replacements, generating PlaceACE and then PlaceaceACE before you get to the condition you wanted.
You need to work through your tracking and exclusion logic carefully, and then apply only one of the replacements. You can check the length of the street_type and apply the unique transition you need for that length.
If you're trying to convert a case statement, then you need to follow that logic pattern, rather than the successive applications you coded. You can easily look up how to simulate a "case" statement in Python.
Also consider using a translation dictionary, such as
type_trans = {
    "pl": "Place",
    "Pl": "Place",
    "PLACE": "Place",
    ...
}
Then your change is simply a lookup per value:
mydata['Street_type'] = mydata['Street_type'].map(type_trans)
Also, you might list all of the variants in a tuple, such as:
type_place = ("PL", "Pl", "pl", "PLACE", "place")
mydata.loc[mydata['Street_type'].isin(type_place), 'Street_type'] = "Place"
... but be sure to generalize this properly for your entire list of street types.
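As a rough sketch of that generalization (the abbreviation table below is an assumption, not a complete set), normalizing the case first keeps the mapping short and leaves unknown values untouched:
# Hypothetical abbreviation table; extend it with the street types you actually have.
street_map = {
    'PL': 'Place',
    'PLACE': 'Place',
    'RD': 'Road',
    'ROAD': 'Road',
    'ST': 'Street',
    'STREET': 'Street',
}

# Upper-case once, then map; values not in the table fall back to their original spelling.
upper = mydata['Street_type'].str.upper()
mydata['Street_type'] = upper.map(street_map).fillna(mydata['Street_type'])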
You can do this correctly with a single pass if you use a proper regex here, e.g. use word boundaries (\b):
In [11]: places = ["PL", "pl", "Pl", "PLACE", "Place", "place"]

In [12]: mydata.Street_type
Out[12]:
0       PL
1       pl
2       Pl
3    PLACE
4    place
Name: Street_type, dtype: object

In [13]: mydata.Street_type.replace(r"(^|\b)({})(\b|$)".format("|".join(places)), "Place", regex=True)
Out[13]:
0    Place
1    Place
2    Place
3    Place
4    Place
Name: Street_type, dtype: object
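To extend the same single-pass, word-boundary idea to more than one street type, a hedged sketch (the variant groups below are illustrative assumptions) can loop over a mapping from each full word to its abbreviations:
# Hypothetical variant lists; each group is replaced by its full word in one regex pass.
variants = {
    'Place': ['PL', 'pl', 'Pl', 'PLACE', 'place'],
    'Road':  ['RD', 'rd', 'Rd', 'ROAD', 'road'],
}

for full_word, abbrevs in variants.items():
    pattern = r"(^|\b)({})(\b|$)".format("|".join(abbrevs))
    mydata['Street_type'] = mydata['Street_type'].replace(pattern, full_word, regex=True)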
# Needleman-Wunsch
def zeros(shape):
    retval = []
    for x in range(shape[0]):
        retval.append([])
        for y in range(shape[1]):
            retval[-1].append(0)
    return retval

match_award = 10
mismatch_penalty = -3
gap_penalty = -4  # both for opening and extending

def match_score(alpha, beta):
    if alpha == beta:
        return match_award
    elif alpha == '-' or beta == '-':
        return gap_penalty
    else:
        return mismatch_penalty

def finalize(align1, align2):
    align1 = align1[::-1]  # reverse sequence 1
    align2 = align2[::-1]  # reverse sequence 2
    i, j = 0, 0
    # calculate identity, score and aligned sequences
    symbol = ''
    found = 0
    score = 0
    identity = 0
    for i in range(0, len(align1)):
        # if two AAs are the same, then output the letter
        if align1[i] == align2[i]:
            symbol = symbol + align1[i]
            identity = identity + 1
            score += match_score(align1[i], align2[i])
        # if they are not identical and neither of them is a gap
        elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-':
            score += match_score(align1[i], align2[i])
            symbol += ' '
            found = 0
        # if one of them is a gap, output a space
        elif align1[i] == '-' or align2[i] == '-':
            symbol += ' '
            score += gap_penalty
    identity = float(identity) / len(align1) * 100
    print('Similarity =', "%3.3f" % identity, 'percent')
    print('Score =', score)
    # print(align1)
    # print(symbol)
    # print(align2)

def needle(seq1, seq2):
    m, n = len(seq1), len(seq2)  # lengths of the two sequences
    # Generate DP table and traceback path pointer matrix
    score = zeros((m + 1, n + 1))  # the DP table
    # Calculate DP table
    for i in range(0, m + 1):
        score[i][0] = gap_penalty * i
    for j in range(0, n + 1):
        score[0][j] = gap_penalty * j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            match = score[i - 1][j - 1] + match_score(seq1[i - 1], seq2[j - 1])
            delete = score[i - 1][j] + gap_penalty
            insert = score[i][j - 1] + gap_penalty
            score[i][j] = max(match, delete, insert)
    # Traceback and compute the alignment
    align1, align2 = '', ''
    i, j = m, n  # start from the bottom right cell
    while i > 0 and j > 0:  # stop when touching the top or the left edge
        score_current = score[i][j]
        score_diagonal = score[i - 1][j - 1]
        score_up = score[i][j - 1]
        score_left = score[i - 1][j]
        if score_current == score_diagonal + match_score(seq1[i - 1], seq2[j - 1]):
            align1 += seq1[i - 1]
            align2 += seq2[j - 1]
            i -= 1
            j -= 1
        elif score_current == score_left + gap_penalty:
            align1 += seq1[i - 1]
            align2 += '-'
            i -= 1
        elif score_current == score_up + gap_penalty:
            align1 += '-'
            align2 += seq2[j - 1]
            j -= 1
    # Finish tracing up to the top left cell
    while i > 0:
        align1 += seq1[i - 1]
        align2 += '-'
        i -= 1
    while j > 0:
        align1 += '-'
        align2 += seq2[j - 1]
        j -= 1
    finalize(align1, align2)

needle('kizlerlo', 'killerpo')
***********************************************************************************************************************
import textdistance as txd
import numpy
txd.overlap('kizlerlo','kilerpo' )
txd.jaro('kizlerlo','killerpo' )
txd.cosine('kizlerlo','killerpo' )
#txd.needleman_wunsch('kizlerlo','killerpo' )
txd.jaro_winkler('kizlerlo','killerpo' )
#txd.smith_waterman('Loans and Accounts','Loans Accounts' )
#txd.levenshtein.normalized_similarity('Loans and Accounts','Loans Accounts' )
from scipy.spatial import distance
a = 'kizlerlo'
b = 'kilerpoo'
#txd.gotoh('Loans and Accounts','Loans Accounts' )
print(txd.needleman_wunsch.normalized_similarity('Loans and Accounts','Loans Accounts' ))
***************************************************************************************************************************
# Euclidean
import math
import numpy as np

def euclid(str1, str2):
    dist = 0.0
    x = str1
    y = str2
    set1 = set()
    for a in range(0, len(x)):
        set1.add(x[a])
    for a in range(0, len(y)):
        set1.add(y[a])
    vec1 = [None] * len(set1)
    vec2 = [None] * len(set1)
    for counter, each_char in enumerate(set1):
        vec1[counter] = x.count(each_char)
        vec2[counter] = y.count(each_char)
    dist = 1 / (1 + math.sqrt(sum([(a - b) ** 2 for a, b in zip(vec1, vec2)])))
    print(dist)

euclid('kizlerlo', 'killerpo')
***************************************************************************************************************************
from similarity.qgram import QGram
import affinegap
qgram = QGram(2)
#print(qgram.distance('kizlerlo', 'killerpo'))
affinegap.affineGapDistance('kizlerlokill' ,'erpozlerlzler')
***************************************************************************************************************************
# Manhattan
def manhattan(str1, str2):
    dist = 0.0
    x = str1
    y = str2
    set1 = set()
    for a in range(0, len(x)):
        set1.add(x[a])
    for a in range(0, len(y)):
        set1.add(y[a])
    vec1 = [None] * len(set1)
    vec2 = [None] * len(set1)
    for counter, each_char in enumerate(set1):
        vec1[counter] = x.count(each_char)
        vec2[counter] = y.count(each_char)
    # dist = sum([np.abs(a - b) for a, b in zip(vec1, vec2)])
    dist = 1 / (1 + sum([np.abs(a - b) for a, b in zip(vec1, vec2)]))
    print(dist)

manhattan('kizlerlo', 'killerpo')
import jellyfish
import json
from Levenshtein import distance, jaro_winkler, jaro, ratio, seqratio

def comp(a, b):
    return jellyfish.jaro_winkler(a, b) * 100 + distance(a, b) + jaro(a, b) * 100

ip = {"CED": "WALMART INC_10958553"}
ala = {}
for index, row in df_ala.iterrows():
    a = ip.get("CED")
    b = row['NN_UID']
    c = comp(a, b)
    ala.update({row['N_UID']: c})
ala_max = max(ala, key=ala.get)
ala_f = {"ALACRA": ala_max}
ces_f = {"CESIUM": "WALMART_10958553_CESIUM"}
dun_f = {"DUNS": "WALMART_10958053_DUNS"}
ref_f = {"REF": "WALMART INC_10958553_REF"}
cax_f = {"CAX": "WALMART LTD_10958553_CAX"}
final_op = {**ala_f, **ces_f, **dun_f, **ref_f, **cax_f}
final_json = json.dumps(final_op)
print(final_json)
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/test', methods=['GET', 'POST'])
def test():
    if request.method == "GET":
        return jsonify({"response": "Get request called"})
    elif request.method == "POST":
        req_Json = request.json
        name = req_Json['name']
        return jsonify({"response": "Hi" + name})

if __name__ == '__main__':
    app.run(debug=True, port=9090)

# Sample POST body:
{
    "name": "Mike"
}
import usaddress
import pandas as pd
import statistics

# sa = dict(usaddress.parse('123 Main St. Suite Chicago, IL'))
adr = pd.read_excel('C:\\VINAYAK\\Address.xlsx')
adr.columns = ['Address']
strlen = []
scr = []
loop = adr['Address'].tolist()
for i in loop:
    strlen.append(len(i))
x = statistics.median(strlen)
for i in loop:
    sa = dict(usaddress.parse(i))
    sa = list(sa.values())
    a = 0
    if len(i) > x:
        a += 5
    if 'AddressNumber' in sa:
        a += 23
    if 'StreetName' in sa:
        # a = a + 20
        a += 17
    if 'OccupancyType' in sa:
        a += 6
    if 'OccupancyIdentifier' in sa:
        a += 12
    if 'PlaceName' in sa:
        a += 12
    if 'StateName' in sa:
        a += 13
    if 'ZipCode' in sa:
        a += 12
    scr.append(a)
adr['Adr_Score'] = scr
adr.head()
# (pd.DataFrame([(key) for key in sa.items()])).transpose()
# pd.DataFrame(dict([(value, key) for key, value in sa.items()]))
# df_ts = pd.DataFrame(columns=['AddressNumber', 'Age', 'City', 'Country'])
# df_ts.append(sa, ignore_index=False, verify_integrity=False, sort=None)
# df_ts.head()
import pandas as pd
from zipfile import ZipFile
# core = []
# f = open('C:/Users/s.natarajakarayalar/1.txt', 'r')
# core.append(str(f.readlines()))
# print(core)
import os
import zipfile
import re
import nltk

core = []
with zipfile.ZipFile('C:/Users/s.natarajakarayalar/TF.zip') as z:
    a = 0
    for filename in z.namelist():
        # if a < 1:
        # if not os.path.isdir(filename):
        # read the file
        with z.open(filename) as f:
            # a = 2
            x = f.readlines()
            core = core + x

with open('C:/Users/s.natarajakarayalar/fins.txt', 'w') as f:
    for item in core:
        f.write("%s\n" % item)

# for i in core:
#     if k < 5:
#         tkt = re.sub(r'.*CONTENT', '', i)
#         new_core.append(tkt)
#         k = k + 1
# for item in core:
#     new_core.append(len(item.split()))
# print(sum(new_core))
# from nltk.tokenize import word_tokenize
# new_core = []
# stp = ['URL:https://', 'TITLE:b', 'META-KEYWORDS:', 'None', 'DOC ID:', 'CONTENT:b', 'URL:', 'TITLE:', 'META-CONTENT:']
# new_core = [word for word in core if word not in stopwords]
# for i in core:
#     wk = word_tokenize(i)
#     for w in wk:
#         if w not in stp:
#             new_core.append(w)

For loop for finding concordance is taking a lot of time for large data. (14+hrs for 0.15mln * 36k rows)

I am running this code in python3.5 to find Concordance (logistic regression).
for i in ones2.index:
    for j in zeros2.index:
        pairs_tested = pairs_tested + 1
        if ones2.iloc[i, 1] > zeros2.iloc[j, 1]:
            conc = conc + 1
        elif ones2.iloc[i, 1] == zeros2.iloc[j, 1]:
            ties = ties + 1
        else:
            disc = disc + 1

# Calculate concordance, discordance and ties
concordance = conc / pairs_tested
discordance = disc / pairs_tested
ties_perc = ties / pairs_tested
print("Concordance = %r", concordance)
print("Discordance = %r", discordance)
print("Tied = %r", ties_perc)
print("Pairs = %r", pairs_tested)
There are 0.15 million rows in zeros2 (a pandas DataFrame) and 36k rows in ones2 (a pandas DataFrame). Both tables have two variables:
[i] Responders (Responder0 = 0 in zeros2 and Responders1 = 1 in ones2).
[ii] Probabilities (prob0 in zeros2 and prob1 in ones2).
My question: the for loop had already been running for 12 hours when this question was posted. How can I perform this operation faster? I am running this on a 64-bit Windows machine with 8 GB of RAM.
Your code is doing 5.4 billion calculations due to the two for loops (0.15 mil * 36k):
I would do something like this (thanks to @Leon for helping me make this answer better):
from bisect import bisect_left, bisect_right

zeros2_list = sorted([zeros2.iloc[j, 1] for j in zeros2.index])
zeros2_length = len(zeros2_list)
disc = ties = conc = 0
for i in ones2.index:
    cur_disc = bisect_left(zeros2_list, ones2.iloc[i, 1])
    cur_ties = bisect_right(zeros2_list, ones2.iloc[i, 1]) - cur_disc
    disc += cur_disc
    ties += cur_ties
    conc += zeros2_length - cur_ties - cur_disc
pairs_tested = zeros2_length * len(ones2.index)
concordance = conc / pairs_tested
discordance = disc / pairs_tested
ties_perc = ties / pairs_tested
print("Concordance = %r", concordance)
print("Discordance = %r", discordance)
print("Tied = %r", ties_perc)
print("Pairs = %r", pairs_tested)
Or, the other way round, like this:
zeros2_list = sorted([zeros2.iloc[j, 1] for j in zeros2.index])
ones2_list = sorted([ones2.iloc[i, 1] for i in ones2.index])
zeros2_length = len(zeros2_list)
ones2_length = len(ones2_list)
disc = ties = conc = 0
for i in zeros2.index:
    cur_conc = bisect_left(ones2_list, zeros2.iloc[i, 1])
    cur_ties = bisect_right(ones2_list, zeros2.iloc[i, 1]) - cur_conc
    conc += cur_conc
    ties += cur_ties
    disc += ones2_length - cur_ties - cur_conc
# We could also achieve the above like this:
# for i in zeros2_list:
#     cur_conc = bisect_left(ones2_list, i)
#     cur_ties = bisect_right(ones2_list, i) - cur_conc
#     conc += cur_conc
#     ties += cur_ties
#     disc += ones2_length - cur_ties - cur_conc
pairs_tested = zeros2_length * ones2_length
concordance = conc / pairs_tested
discordance = disc / pairs_tested
ties_perc = ties / pairs_tested
print("Concordance = %r", concordance)
print("Discordance = %r", discordance)
print("Tied = %r", ties_perc)
print("Pairs = %r", pairs_tested)
Probability = model.predict_proba(data[predictors])
Probability1 = pd.DataFrame(Probability)
Probability1.columns = ['Prob_LoanStatus_0', 'Prob_LoanStatus_1']
TruthTable = pd.merge(data[[outcome]], Probability1[['Prob_LoanStatus_1']], how='inner', left_index=True, right_index=True)
zeros = TruthTable[(TruthTable['Loan_Status'] == 0)].reset_index().drop(['index'], axis=1)
ones = TruthTable[(TruthTable['Loan_Status'] == 1)].reset_index().drop(['index'], axis=1)

from bisect import bisect_left, bisect_right

zeros_list = sorted([zeros.iloc[j, 1] for j in zeros.index])
zeros_length = len(zeros_list)
disc = 0
ties = 0
conc = 0
for i in ones.index:
    cur_conc = bisect_left(zeros_list, ones.iloc[i, 1])
    cur_ties = bisect_right(zeros_list, ones.iloc[i, 1]) - cur_conc
    conc += cur_conc
    ties += cur_ties
pairs_tested = zeros_length * len(ones.index)
disc = pairs_tested - conc - ties
print("Pairs = ", pairs_tested)
print("Conc = ", conc)
print("Disc = ", disc)
print("Tied = ", ties)
concordance = conc / pairs_tested
discordance = disc / pairs_tested
ties_perc = ties / pairs_tested
print("Concordance = %r", concordance)
print("Discordance = %r", discordance)
print("Tied = %r", ties_perc)
I followed the reply of Sreyantha Chary, which was elegant, but the concordance and discordance percentages were mixed up in the first part of the answer.

Memory overflow in Python

I have 67,000 files that I need to read and extract word similarities from, but when I run the code my laptop becomes much slower, I can't open any other application, and eventually a memory overflow error shows up (even when I run it on only around 10,000 of the files). Is there a way to clear the memory after every for loop, or is running the code on all the files simply impossible? Below is the code:
import math
import operator
import os
import string

import nltk

def isAscii(s):
    for c in s:
        if c not in string.printable:
            return False
    return True

windowSize = 2
relationTable = {}
probabilities = {}
wordCount = {}
totalWordCount = 0

def sim(w1, w2):
    numerator = 0
    denominator = 0
    if (w1 in relationTable) and (w2 in relationTable):
        rtw1 = {}
        rtw2 = {}
        rtw1 = relationTable[w1]
        rtw2 = relationTable[w2]
        for word in rtw1:
            rtw1_PMI = rtw1[word]['pairPMI']
            denominator += rtw1_PMI
            if word in rtw2:
                rtw2_PMI = rtw2[word]['pairPMI']
                numerator += (rtw1_PMI + rtw2_PMI)
        for word in rtw2:
            rtw2_PMI = rtw2[word]['pairPMI']
            denominator += rtw2_PMI
        if denominator != 0:
            return float(numerator) / denominator
        else:
            return 0
    else:
        return -1

AllNotes = {}
AllNotes = os.listdir("C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes")

fileStopPunctuations = open('C:/Users/nerry-san/Desktop/EECE 502/stopPunctuations.txt')
stopPunctuations = nltk.word_tokenize(fileStopPunctuations.read())

for x in range(0, 10):
    fileToRead = open('C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % (AllNotes[x]))
    case1 = fileToRead.read()
    text = nltk.WordPunctTokenizer().tokenize(case1.lower())
    final_text = []
    for index in range(len(text)):
        word = text[index]
        if word not in stopPunctuations:
            final_text.append(word)
    for index in range(len(final_text)):
        w1 = final_text[index]
        if isAscii(w1):
            for index2 in range(-windowSize, windowSize + 1):
                if index2 != 0:
                    if (index + index2) in range(0, len(final_text)):
                        w2 = final_text[index + index2]
                        if isAscii(w2):
                            totalWordCount += 1
                            if w1 not in wordCount:
                                wordCount[w1] = {}
                                wordCount[w1]['wCount'] = 0
                            try:
                                wordCount[w1][w2]['count'] += 1
                                wordCount[w1]['wCount'] += 1
                            except KeyError:
                                wordCount[w1][w2] = {'count': 1}
                                wordCount[w1]['wCount'] += 1

for word in wordCount:
    probabilities[word] = {}
    probabilities[word]['wordProb'] = float(wordCount[word]['wCount']) / totalWordCount

for word in wordCount:
    relationTable[word] = {}
    for word2 in wordCount[word]:
        if word2 != 'wCount':
            pairProb = float(wordCount[word][word2]['count']) / (wordCount[word]['wCount'])
            relationTable[word][word2] = {}
            relationTable[word][word2]['pairPMI'] = math.log(float(pairProb) / (probabilities[word]['wordProb'] * probabilities[word2]['wordProb']), 2)

l = []
for word in relationTable:
    l.append(word)

for index in range(0, len(l)):
    word = l[index]
    simValues = []
    for index2 in range(0, len(l)):
        word2 = l[index2]
        if word != word2:
            simVal = sim(word, word2)
            if simVal > 0:
                simValues.append([word2, simVal])
    simValues.sort(key=operator.itemgetter(1), reverse=True)
Every time you open a file, use the "with" statement. This will ensure the file is closed when the loop finishes (or rather, when the with block is exited).
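For instance, a minimal sketch of the file-reading part of the loop above rewritten with a context manager (paths and variable names taken from the question) would be:
for x in range(0, 10):
    path = 'C:/Users/nerry-san/Desktop/EECE 502/MedicalNotes/%s' % (AllNotes[x])
    with open(path) as fileToRead:
        case1 = fileToRead.read()
    # fileToRead is closed here, as soon as the with block is exited
    text = nltk.WordPunctTokenizer().tokenize(case1.lower())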
