Making Excel histograms using python - python

This is the output of my python script so far.
Excel Table
The vertical axis of the table are road names. The horizontal axis are dates. The values indicate if a road was under construction at the time and why. I'd like to make a line graph that groups the dates by years 2017, 2018, 2019 etc... and plots the longest amount a time within those groups that a road was under construction and the average amount for the whole year. I'm a complete novice in excel and don't know how to leverage it's features to achieve my goal, though I suspect that there may be built in functions that do what I want without much difficulty. Any suggestions on how can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()
def to_raw(string):
return fr"{string}"
def cvrt(x):
ans = re.split(r'(\d+)(?!.*\d)', x)
return int(ans[1])
def indexer(s):
pattern = re.compile(r'I, [0-9]+, ')
gm = re.split(pattern, s);
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def int2Date(x):
string = str(x)
Y = int(string[0:4])
M = int(string[4:6])
D = int(string[6:8])
return DATE(Y,M,D)
def dDelta(x, y):
string1 = str(x)
string2 = str(y)
Y1 = int(string1[0:4])
M1 = int(string1[4:6])
D1 = int(string1[6:8])
Y2 = int(string2[0:4])
M2 = int(string2[4:6])
D2 = int(string2[6:8])
f_date = DATE(Y1,M1,D1)
l_date = DATE(Y2,M2,D2)
delta = l_date - f_date
if isinstance(y, int):
return float(int((delta.days)/30.44))
else:
return int((delta.days)/30.44)
def Book(path):
file = open(path,'r')
lines = file.readlines()
file.close()
book = IndexedOrderedDict()
for line in lines:
if re.match("I", line):
IDs = indexer(line)[1]
if re.match(" 0.00,", line):
rID = line
#"GM_FINAL_AUTH,0,[1-9]"
if re.search("GM_FINAL_AUTH,0,[1-9]", line):
book.update({(rID, line): to_raw(IDs)})
return sort_book(book)
def dUpdate(dic, key, value):
return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})
def valSplt(s):
pattern = re.compile(r'(\d+)')
gm = re.split(pattern, s)
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def sort_book(book):
book = natsorted([value, key] for key, value in book.items())
book = IndexedOrderedDict((data[1], data[0]) for data in book)
return book
def alph_order(word1, word2):
for i in range(min(len(word1), len(word2))):
if ord(word1[i]) == ord(word2[i]):
pass
elif ord(word1[i]) > ord(word2[i]):
return word2
else:
return word1
return word1
def read(cpdm, date_list):
sCnt = [0] * len(cpdm)
lowest_number = 999999999999
terminationCondition = [True] * len(cpdm)
saved_results = [0] * len(cpdm)
current_prefix = None
cnt = 0
while any(terminationCondition) is True:
saved_results = [0] * len(cpdm)
last_prefix = None
lowest_number = 999999999999
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
ID = cpdm[dicIdx].values()[dicVal]
# print(entry)
current_prefix, road_number = valSplt(ID)
road_number = int(road_number)
if last_prefix is None:
last_prefix = current_prefix
higherOrder_prefix = alph_order(last_prefix, current_prefix)
# print('check:',[higherOrder_prefix, last_prefix, current_prefix])
if current_prefix == higherOrder_prefix:
if current_prefix != last_prefix:
lowest_number = road_number
last_prefix = current_prefix
elif road_number < lowest_number:
lowest_number = road_number
last_prefix = current_prefix
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
# print(dicIdx, dicVal, len(cpdm[dicIdx]))
ID = cpdm[dicIdx].values()[dicVal]
VALUE = cpdm[dicIdx].keys()[dicVal]
# print(entry)
road_name, road_number = valSplt(ID)
road_number = int(road_number)
if road_name == last_prefix and lowest_number == road_number:
saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
if dicVal < len(cpdm[dicIdx]):
sCnt[dicIdx] += 1
else:
terminationCondition[dicIdx] = False
else:
terminationCondition[dicIdx] = False
for rst in range(len(saved_results)):
if saved_results[rst] == 0:
pass
else:
sheet.write(cnt+1, 0, str(saved_results[rst][0]))
sheet.write(cnt+1, rst+1, cvrt(saved_results[rst][1]))
#sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
#sheet.write(cnt+1, 0, saved_results[rst][3])
cnt += 1
def main():
# 2018 MAPS
path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
# 2019 MAPS
path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
# 2020 MAPS
path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
p_list = [path1, path2, path3, path4, path5, path6, path7,
path8, path9, path10, path11, path12, path13]
pool = mp.Pool(mp.cpu_count())
CPDM = pool.map(Book, p_list)
pool.close()
#pool.join()
date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
#CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
for i in CPDM:
print(len(i))
#sheet.write("A1", "Lat Long")
sheet.write("A1", "ID")
#for i in range(len(CPDM)):
cn = 0
for i in date_list:
#sheet.write(0, 3*i+1, "ID" + str(i+1))
sheet.write(0, cn+1, int2Date(i), format1)
cn += 1
#sheet.write(0, 2*i+3, "Date" + str(i+1))
read(CPDM, date_list)
workBook.close()
if __name__ == "__main__":
main()
executionTime = (time.time() - startTime)
print('Execution time in minutes: ' + str(executionTime/60))

Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
# Group by consecutive 1's
groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
# Every group denotes a period for which the road was under construction
for _, group in groups:
# Get the start and finish for each group
roads.append({
"road": road_name,
"start": group.index[0],
"finish": group.index[-1] + pandas.Timedelta(1, unit="D"), # Add one day because groups with same start and finish will not be visible on the plot
})
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()

Related

PDF splitting with Bookmarks in python through PyPDF4 - bookmarks are losing in the output

I am trying to create a script to split the pdf pages for the given page numbers/labels from the pdf, the script are producing the split pdf correctly, but few information are losing, and need to be correct
book mark is losing in the separated pdf, if original pdf contains bookmark
if pdf contains the page labels with Roman and arabic page numbers,
like prelims part start with i, ii, iii, iv ... then again main matter part start with Arabic number 1, 2 ,3... and so on, when passing the value of arabic number for split, it is splitting the prelim part (Roman page number), ie., (start = 5, end = 10 ), but it is splitting from (start = V, End = X)
how to correct the issue in the below script
MWE
import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *
#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter,PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme
pg_info = open('pageinfo.txt','r')
pgcnt=pg_info.read()
pg_info.close()
print(pgcnt)
pgcnt= re.sub(r'<Misc="([0-9]+)" StartPage="([^">].*)">\s*<Misc="(?:[0-9]+)" EndPage="([^">].*)"/>',r'<Misc="\1" StartPage="\2" EndPage="\3"/>',pgcnt,re.I | re.S| re.M)
print(pgcnt)
pno = []
def value(rno):
r = rno.upper()
if (r == 'I'):
return 1
if (r == 'V'):
return 5
if (r == 'X'):
return 10
if (r == 'L'):
return 50
if (r == 'C'):
return 100
if (r == 'D'):
return 500
if (r == 'M'):
return 1000
return -1
def romanToDecimal(str):
res = 0
i = 0
while (i < len(str)):
# Getting value of symbol s[i]
s1 = value(str[i])
if (i + 1 < len(str)):
# Getting value of symbol s[i + 1]
s2 = value(str[i + 1])
# Comparing both values
if (s1 >= s2):
# Value of current symbol is greater
# or equal to the next symbol
res = res + s1
i = i + 1
else:
# Value of current symbol is greater
# or equal to the next symbol
res = res + s2 - s1
i = i + 2
else:
res = res + s1
i = i + 1
return res
def get_pageInfo(pginfo):
global pno
for m in re.finditer(r'<Misc="([0-9]+)" StartPage="([^">].*)" EndPage="([^">].*)"/>',pginfo,re.I):
Start_page = m.group(2)
End_page = m.group(3)
x = Start_page
y = End_page
numeric_test = x.isnumeric()
if not numeric_test:
Start_page = romanToDecimal(Start_page)
else:
Start_page = int(Start_page)
numeric_test = y.isnumeric()
if not numeric_test:
End_page = romanToDecimal(End_page)
else:
End_page = int(End_page)
print(x, Start_page, y, End_page)
pno.append((Start_page,End_page))
return pno
pgdetails = get_pageInfo(pgcnt)
print(pgdetails)
def pdf_splitter(file,start,end,fcount):
fix_start = start
#we will save new splited pdf as "nameofpdf splitted.pdf"
#example if pdf name is "abc.pdf" then it will be saved as "abc splitted.pdf"
new_file_name = str(fcount)+".pdf"
read_file = PdfFileReader(open(file,"rb")) #read pdf
new_pdf = PdfFileWriter() #create write object
start-=1
try:
with open(new_file_name,"wb") as f:
for i in range(start, end):
new_pdf.addPage(read_file.getPage(i))
new_pdf.write(f)
i+=1
f.close()
print("PDF splitted Successfully")
reader = PdfReader(new_file_name)
labels = PageLabels.from_pdf(reader)
newlabel = PageLabelScheme(startpage=0, # the index of the page of the PDF where the labels will start
style="roman lowercase", # See options in PageLabelScheme.styles()
prefix="",
firstpagenum=fix_start) # number to attribute to the first page of this index
labels.append(newlabel) # Adding our page labels to the existing ones
labels.write(reader)
writer = PdfWriter()
writer.trailer = reader
writer.write(new_file_name)
except Exception as e:
print(e)
x = 0
for i in pgdetails:
x += 1
#pvalaue = i
Start,End = i
pdf_splitter('input.pdf',Start,End,x)
sys.exit()
and the page information file (txt) will contain the below information
<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>
Thanks in Advance

Split json file into multiple csv files depending on date?

I am trying to split up a json file from alpha-vantages api into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with but it gives me the TypeError: 'list' object is not callable". I'm fairly new to python and pandas so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json
symbol = "MSFT"
symbol_list = symbol.split(",")
def num_el(list):
count = 0
for element in list:
count += 1
return count
def csv_make(sy, dar, dat):
csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
csv_file.write(dat)
csv_file.close()
i = 0
x = -1
n = num_el(symbol_list)
while i < n:
namesym = symbol_list[x]
ticker = namesym
api_key = 'APIKEYHERE'
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
data = requests.get(url)
dsf = data.json()
daf = pd.DataFrame(dsf['Time Series (1min)'])
dxf: DataFrame = daf.T
dxf.index.name = 'time'
dxf.reset_index(inplace=True)
dxf['time'] = pd.to_datetime(dxf['time'])
dxf['minute'] = dxf['time'].dt.time
dxf['day'] = dxf['time'].dt.day
dxf['date'] = dxf['time'].dt.date
agg = dxf.groupby([dxf['day']])
length1 = dxf.groupby([dxf['day']]).size()
length = pd.DataFrame(length1)
length.index.name = 'day'
length.reset_index(inplace=True)
length_sum = length[0].sum()
v = 0
d = length_sum
b = len(length)
x2 = length_sum
while v < b:
a = length[0][v]
x2 -= length[0][v]
xd = agg.get_group(length['day'][v])
date = xd['date'][x2]
max_dt = parser.parse(str(max(xd['minute'])))
min_dt = parser.parse(str(min(xd['minute'])))
dt_range = []
while min_dt <= max_dt:
dt_range.append(min_dt.strftime("%H:%M:%S"))
min_dt += timedelta(seconds=60)
complete_df = pd.DataFrame({'minute': dt_range})
xy = complete_df.astype('str')
yx = xd.astype('str')
dasf = xy.merge(yx, how='left', on='minute')
dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
time = []
open = []
high = []
low = []
close = []
volume = []
empty_value = []
for ib in range(len(dasf)):
time.append(dasf['minute'][ib])
open.append(dasf['1. open'][ib])
high.append(dasf['2. high'][ib])
low.append(dasf['3. low'][ib])
close.append(dasf['4. close'][ib])
volume.append(dasf['5. volume'][ib])
empty_value.append(dasf['ev'][ib])
time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
high_df = pd.DataFrame(high).rename(columns={0: 'High'})
low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
df = pd.concat(frames, axis=1, join='inner')
df = df.set_index('Time')
ad = df.to_csv()
csv_make(namesym, date, ad)
v += 1
i += 1

Export data efficiently to CSV using python

I am using my arduino to analyze analog inputs and I am accessing the arduino using the pyfirmata library and I ambasically measuring voltages using the 6 analog inputs on my arduino Uno. I need to find a way to live time feed this data into a CSV efficiently... I am not sure on the best way to do that
Any suggestion would help but please write out the code you suggest. I would prefer to use Pandas if possible because it's easier
voltage0 through voltage5 are my variables and I am trying to report those in a nice format that will later have to be analyzed
import time
from datetime import datetime
import pyfirmata
import pandas as pd
board = pyfirmata.Arduino('/dev/ttyACM1')
analog_pin0 = board.get_pin('a:0:i')
analog_pin1 = board.get_pin('a:1:i')
analog_pin2 = board.get_pin('a:2:i')
analog_pin3 = board.get_pin('a:3:i')
analog_pin4 = board.get_pin('a:4:i')
analog_pin5 = board.get_pin('a:5:i')
it = pyfirmata.util.Iterator(board)
it.start()
analog_pin0.enable_reporting()
analog_pin1.enable_reporting()
analog_pin2.enable_reporting()
analog_pin3.enable_reporting()
analog_pin4.enable_reporting()
analog_pin5.enable_reporting()
data = []
count = 0
x = 0
start = 0
while x <= 1000:
reading0 = analog_pin0.read()
if reading0 != None:
voltage0 = reading0 * 5
voltage0 = round(voltage0,2)
else:
voltage0 = float('nan')
reading1 = analog_pin1.read()
if reading1 != None:
voltage1 = reading1 * 5
voltage1 = round(voltage1,2)
else:
voltage1 = float('nan')
reading2 = analog_pin2.read()
if reading2 != None:
voltage2 = reading2 * 5
voltage2 = round(voltage2,2)
else:
voltage2 = float('nan')
reading3 = analog_pin3.read()
if reading3 != None:
voltage3 = reading3 * 5
voltage3 = round(voltage3,2)
else:
voltage3 = float('nan')
reading4 = analog_pin4.read()
if reading4 != None:
voltage4 = reading4 * 5
voltage4 = round(voltage4,2)
else:
voltage4 = float('nan')
reading5 = analog_pin5.read()
if reading5 != None:
voltage5 = reading5 * 5
voltage5 = round(voltage5,2)
else:
voltage5 = float('nan')
datarow = {'Voltage0': voltage0, 'Voltage1': voltage1, 'Voltage2' : voltage2, 'Voltage3': voltage3, 'Voltage4' : voltage4, 'Voltage5' : voltage5, 'Time' : time.strftime("%Y-%m-%d_%H:%M:%S")}
data.append(datarow)
if count%500 == 0:
dataframe = pd.DataFrame(data)
dataframe.to_csv('data.csv')
x += 1
count += 1
#time.sleep(1)enter code here
Your code seems to work, but it's not very efficient. Every 500 iterations, you rewrite all your data instead of updating your file with the new data in the end. You might consider saving it this way instead:
if count%500 == 0:
dataframe = pd.DataFrame(data)
dataframe.to_csv('data.csv',mode='a',header=False)
data = []
If it's still not fast enough, you might consider saving your data to a binary format such as .npy (numpy format), and convert it later to csv.

How to standardize address type properly

I'm trying to standardize street address by converting the abbreviations to the full word (e.g. RD - Road). I created many lines to account for different spellings and ran into an issue where one replace code overrode another one
import pandas as pd
mydata = {'Street_type': ['PL', 'pl', 'Pl', 'PLACE', 'place']}
mydata = pd.DataFrame(mydata)
mydata['Street_type'] = mydata['Street_type'].replace('PL','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('Pl','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('PLACE','Place',regex=True)
mydata['Street_type'] = mydata['Street_type'].replace('place','Place',regex=True)
Instead of Place, I got Placeace. What is the best way to avoid this error? Do I write a if-else statement or any function? Thanks in advance!
Among other problems, you have overlapping logic: you fail to check that the target ("old") string is a full word before you replace it. For instance, with the input type of "PLACE", you trigger both the first and third replacements, generating PlaceACE and then PlaceaceACE before you get to the condition you wanted.
You need to work through your tracking and exclusion logic carefully, and then apply only one of the replacements. You can check the length of the street_type and apply the unique transition you need for that length.
If you're trying to convert a case statement, then you need to follow that logic pattern, rather than the successive applications you coded. You can easily look up how to simulate a "case" statement in Python.
Also consider using a translation dictionary, such as
type_trans = {
"pl": "Place",
"Pl": "Place",
"PLACE": "Place",
...
}
Then your change is simply
mydata['Street_type'] = type_trans[mydata['Street_type']]
Also, you might list all of the variants in a tuple, such as:
type_place = ("PL", "Pl", "pl", "PLACE", "place")
if mydata['Street_type'] in type_place
mydata['Street_type'] = "Place"
... but be sure to generalize this properly for your entire list of street types.
You can do this correctly with a single pass if you use a proper regex here, e.g. use word boundaries (\b):
In [11]: places = ["PL", "pl", "Pl", "PLACE", "Place", "place"]
In [12]: mydata.Street_type
Out[12]:
0 PL
1 pl
2 Pl
3 PLACE
4 place
Name: Street_type, dtype: object
In [13]: mydata.Street_type.replace("(^|\b)({})(\b|$)".format("|".join(places)), "Place", regex=True)
Out[13]:
0 Place
1 Place
2 Place
3 Place
4 Place
Name: Street_type, dtype: object
#Needlemanwunch
def zeros(shape):
retval = []
for x in range(shape[0]):
retval.append([])
for y in range(shape[1]):
retval[-1].append(0)
return retval
match_award = 10
mismatch_penalty = -3
gap_penalty = -4 # both for opening and extanding
def match_score(alpha, beta):
if alpha == beta:
return match_award
elif alpha == '-' or beta == '-':
return gap_penalty
else:
return mismatch_penalty
def finalize(align1, align2):
align1 = align1[::-1] #reverse sequence 1
align2 = align2[::-1] #reverse sequence 2
i,j = 0,0
#calcuate identity, score and aligned sequeces
symbol = ''
found = 0
score = 0
identity = 0
for i in range(0,len(align1)):
# if two AAs are the same, then output the letter
if align1[i] == align2[i]:
symbol = symbol + align1[i]
identity = identity + 1
score += match_score(align1[i], align2[i])
# if they are not identical and none of them is gap
elif align1[i] != align2[i] and align1[i] != '-' and align2[i] != '-':
score += match_score(align1[i], align2[i])
symbol += ' '
found = 0
#if one of them is a gap, output a space
elif align1[i] == '-' or align2[i] == '-':
symbol += ' '
score += gap_penalty
identity = float(identity) / len(align1) * 100
print('Similarity =', "%3.3f" % identity, 'percent')
print('Score =', score)
# print(align1)
# print(symbol)
# print(align2)
def needle(seq1, seq2):
m, n = len(seq1), len(seq2) # length of two sequences
# Generate DP table and traceback path pointer matrix
score = zeros((m+1, n+1)) # the DP table
# Calculate DP table
for i in range(0, m + 1):
score[i][0] = gap_penalty * i
for j in range(0, n + 1):
score[0][j] = gap_penalty * j
for i in range(1, m + 1):
for j in range(1, n + 1):
match = score[i - 1][j - 1] + match_score(seq1[i-1], seq2[j-1])
delete = score[i - 1][j] + gap_penalty
insert = score[i][j - 1] + gap_penalty
score[i][j] = max(match, delete, insert)
# Traceback and compute the alignment
align1, align2 = '', ''
i,j = m,n # start from the bottom right cell
while i > 0 and j > 0: # end toching the top or the left edge
score_current = score[i][j]
score_diagonal = score[i-1][j-1]
score_up = score[i][j-1]
score_left = score[i-1][j]
if score_current == score_diagonal + match_score(seq1[i-1], seq2[j-1]):
align1 += seq1[i-1]
align2 += seq2[j-1]
i -= 1
j -= 1
elif score_current == score_left + gap_penalty:
align1 += seq1[i-1]
align2 += '-'
i -= 1
elif score_current == score_up + gap_penalty:
align1 += '-'
align2 += seq2[j-1]
j -= 1
# Finish tracing up to the top left cell
while i > 0:
align1 += seq1[i-1]
align2 += '-'
i -= 1
while j > 0:
align1 += '-'
align2 += seq2[j-1]
j -= 1
finalize(align1, align2)
needle('kizlerlo','killerpo' )
***********************************************************************************************************************
#import textdistance as txd
import numpy
txd.overlap('kizlerlo','kilerpo' )
txd.jaro('kizlerlo','killerpo' )
txd.cosine('kizlerlo','killerpo' )
#txd.needleman_wunsch('kizlerlo','killerpo' )
txd.jaro_winkler('kizlerlo','killerpo' )
#txd.smith_waterman('Loans and Accounts','Loans Accounts' )
#txd.levenshtein.normalized_similarity('Loans and Accounts','Loans Accounts' )
from scipy.spatial import distance
a = 'kizlerlo'
b = 'kilerpoo'
#txd.gotoh('Loans and Accounts','Loans Accounts' )
print(txd.needleman_wunsch.normalized_similarity('Loans and Accounts','Loans Accounts' ))
***************************************************************************************************************************
#Euclidean
import math
import numpy as np
def euclid(str1,str2):
dist=0.0
x=str1
y=str2
set1=set()
for a in range(0,len(x)):
set1.add(x[a])
for a in range(0,len(y)):
set1.add(y[a])
vec1=[None]*len(set1)
vec2=[None]*len(set1)
for counter,each_char in enumerate(set1):
vec1[counter]=x.count(each_char)
vec2[counter]=y.count(each_char)
dist=1/(1+math.sqrt(sum([(a - b) ** 2 for a, b in zip(vec1, vec2)])))
print(dist)
euclid('kizlerlo','killerpo')
***************************************************************************************************************************
from similarity.qgram import QGram
import affinegap
qgram = QGram(2)
#print(qgram.distance('kizlerlo', 'killerpo'))
affinegap.affineGapDistance('kizlerlokill' ,'erpozlerlzler')
***************************************************************************************************************************
#manhattan
def manhattan(str1,str2):
dist=0.0
x=str1
y=str2
set1=set()
for a in range(0,len(x)):
set1.add(x[a])
for a in range(0,len(y)):
set1.add(y[a])
vec1=[None]*len(set1)
vec2=[None]*len(set1)
for counter,each_char in enumerate(set1):
vec1[counter]=x.count(each_char)
vec2[counter]=y.count(each_char)
#dist= sum([np.abs(a - b) for a, b in zip(vec1, vec2)])
dist=1/(1+sum([np.abs(a - b) for a, b in zip(vec1, vec2)]))
print(dist)
manhattan('kizlerlo','killerpo')
import jellyfish
import json
from Levenshtein import distance,jaro_winkler,jaro,ratio,seqratio
def comp(a,b):
return jellyfish.jaro_winkler(a,b)*100 + distance(a,b) + jaro(a,b)*100
ip = {"CED":"WALMART INC_10958553"}
ala = {}
for index,row in df_ala.iterrows():
a = ip.get("CED")
b = row['NN_UID']
c = comp(a,b)
ala.update({row['N_UID'] : c})
ala_max = max(ala, key=ala.get)
ala_f = {"ALACRA" : ala_max}
ces_f = {"CESIUM" : "WALMART_10958553_CESIUM"}
dun_f = {"DUNS" : "WALMART_10958053_DUNS"}
ref_f = {"REF" : "WALMART INC_10958553_REF"}
cax_f = {"CAX" : "WALMART LTD_10958553_CAX"}
final_op = {**ala_f,**ces_f,**dun_f,**ref_f,**cax_f }
final_json = json.dumps(final_op)
print(final_json)
from flask import Flask,request, jsonify
app = Flask(__name__)
#app.route('/test',methods = ['GET','POST'])
def test():
if request.method == "GET":
return jsonify({"response":"Get request called"})
elif request.method == "POST":
req_Json = request.json
name = req_Json['name']
return jsonify({"response": "Hi" + name})
if __name__ == '__main__':
app.run(debug = True,port = 9090)
{
"name": "Mike"
}
import usaddress
import pandas as pd
import statistics
#sa = dict(usaddress.parse('123 Main St. Suite Chicago, IL' ))
adr = pd.read_excel('C:\\VINAYAK\\Address.xlsx')
adr.columns = ['Address']
strlen = []
scr = []
loop = adr['Address'].tolist()
for i in loop:
strlen.append(len(i))
x = statistics.median(strlen)
for i in loop:
sa = dict(usaddress.parse(i))
sa = list(sa.values())
a = 0
if len(i) > x :
a+= 5
if 'AddressNumber' in sa :
a+= 23
if 'StreetName' in sa :
#a = a + 20
a+= 17
if 'OccupancyType' in sa :
a+= 6
if 'OccupancyIdentifier' in sa :
a+= 12
if 'PlaceName' in sa :
a+= 12
if 'StateName' in sa :
a+= 13
if 'ZipCode' in sa :
a+= 12
scr.append(a)
adr['Adr_Score'] = scr
adr.head()
#(pd.DataFrame([(key) for key in sa.items()])).transpose()
#pd.DataFrame(dict([(value, key) for key, value in sa.items()]))
#pd.DataFrame(dict([(value, key) for key, value in sa.items()]))
# df_ts = pd.DataFrame(columns = ['AddressNumber' , 'Age', 'City' , 'Country'])
# df_ts.append(sa, ignore_index=False, verify_integrity=False, sort=None)
# df_ts.head()
import pandas as pd
from zipfile import ZipFile
# core = []
# f = open('C:/Users/s.natarajakarayalar/1.txt','r')
# core.append(str(f.readlines()))
# print(core)
import os
import zipfile
import re
import nltk
import os
core = []
with zipfile.ZipFile('C:/Users/s.natarajakarayalar/TF.zip') as z:
a = 0
for filename in z.namelist():
#if a < 1:
#if not os.path.isdir(filename):
# read the file
with z.open(filename) as f:
#a = 2
x = f.readlines()
core = core + x
with open('C:/Users/s.natarajakarayalar/fins.txt', 'w') as f:
for item in core:
f.write("%s\n" % item)
# for i in core:
# if k < 5:
# tkt = re.sub(r'.*CONTENT', '', i)
# new_core.append(tkt)
# k = k+1
# for item in core:
# new_core.append(len(item.split()))
# print(sum(new_core))
# from nltk.tokenize import word_tokenize
# new_core = []
# stp = ['URL:https://','TITLE:b','META-KEYWORDS:','None','DOC ID:','CONTENT:b','URL:','TITLE:','META-CONTENT:']
# #new_core = [word for word in core if word not in stopwords]
# for i in core:
# wk = word_tokenize(i)
# for w in wk:
# if w not in stp:
# new_core.append(w)

Add each new dictionary result in the order of the columns of a dataframe

I am new to Python, but hope to explain the issue.
dfrow - is a dictionary of a single regression summary
results - is an empty dataframe with same columns as in dfrow
I would like to save regression results for each observation in the outer loop at the same time making sure column order in the inner loop. I am getting a result for the first observations but cannot move further, error saying:
Traceback (most recent call last):
File "<stdin>", line 109, in <module>
TypeError: 'numpy.int64' object is not iterable
when I run this code
import pandas as pd
import numpy as np
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.stats import stattools as st
import statsmodels.api as sm
import collections
import datetime
import warnings
import scipy.stats
df_rent = import_rents()
df_return = import_ee_rets()
mostrecent = df_return.iloc[len(df_return) - 1]
mostrecentYYYY = mostrecent['Year']
mostrecentQ = mostrecent['Quarter']
mostrecentperiod = str(mostrecentYYYY) + "-Q" + str(mostrecentQ)
rentcols = df_rent.columns.values
colnames = []
#loop through the columns in df_rent until the column == the most recent period for which we have ee return data
for colname in rentcols:
if colname != mostrecentperiod:
colnames.append(colname)
else:
colnames.append(colname)
break
rentcols = colnames
#subset df_rent to only include columns that also have ee return data
df_rent = df_rent[rentcols]
#change dtype of metro_code / metro columns to string for matching later
df_rent['metro_code'] = df_rent['metro_code'].apply(str)
df_return['Metro'] = df_return['Metro'].apply(str)
df = pd.read_csv('//x/Project/_data/raw_data/rent_change.csv')
metros = list(np.unique(df['metro_code']))
regress_result_names = [
'metro',
'num_lag',
'num_ma',
'num_AR',
'beta_x1_retmov',
'x1_se',
'x1_tstat',
'x1_pval',
'r-squared',
'reg_fstat',
'fstat_pvalue',
'durbin-watson',
'resid_var']
regress_result_names = pd.Series(regress_result_names)
results = pd.DataFrame(columns=regress_result_names)
row = 0
for metro in metros:
for nlag in range(0, 5):
for nma in range(1, 11):
for AR in range(1, 5):
y = df_rent[df_rent['metro_code'] == str(metro)]
y = y.values.tolist()
y = y[0]
# delete first two columns of df_rent (they don't contain numeric data)
y.pop(0)
y.pop(0)
#y = rent time series data for specific metro
y = pd.Series(y)
#x1 = lagged moving average data for given params
df_return1 = df_return[df_return['Metro'] == str(metro)]
df_return1 = df_return1.reset_index(drop = True)
x1 = lagged_moving_avg(df = df_return1, metro_code = metro, nlag = nlag, nma = nma)
#y and x1 dataframe
y_label = 'y_Rent'
x_lagMA_label = 'x1_LaggedMA'
df1 = pd.DataFrame()
df1[y_label] = y
df1[x_lagMA_label] = x1
if mostrecentQ == 1:
currmonth = "01"
elif mostrecentQ == 2:
currmonth = "04"
elif mostrecentQ == 3:
currmonth = "07"
else:
currmonth = "10"
#convert index to datetime to run the regressions
currpd = pd.to_datetime((str(mostrecentYYYY) + currmonth), format='%Y%m')
df1.index = pd.date_range(*(pd.to_datetime(['1990-01', currpd]) + pd.offsets.QuarterEnd()), freq='Q')
#drop any rows that have missing observations
df1 = df1.dropna()
#df1.to_csv('//Nisfile01/x/Project - Real Estate Database/real_estate/odil/XandY.csv', index=True)
reg = ARIMA(endog = df1[y_label], order = (AR, 0,0)).fit(trend = 'nc', disp = 0, tol=1e-20)
resid_reg = reg.resid
reg2 = sm.OLS(resid_reg, df1[x_lagMA_label]).fit()
resid_reg2 = reg2.resid
dfrow = {
'metro': metro,
'num_lag': nlag,
'num_ma': nma,
'num_AR': AR,
'beta_x1_retmov': reg2.params[0],
'x1_se': reg2.bse[0],
'x1_tstat': reg2.tvalues[0],
'x1_pval': reg2.pvalues[0],
'r-squared': reg2.rsquared,
'reg_fstat':reg2.fvalue,
'fstat_pvalue': reg2.f_pvalue,
'durbin-watson': st.durbin_watson(reg2.resid),
'resid_var': resid_reg2.var(),
}
#create df for output called results
for key in dfrow.keys():
results.loc[row, key] = list(dfrow[key])
row = row + 1
Any help is very much appreciated.
P.S. Sorry for the messy code
The offending line is results.loc[row, key] = list(dfrow[key]).
You are trying to convert a single value, in this case a numpy.int64 object, to a list. I assume that what you're trying to do, and correct me if I am wrong, is create a singleton list with the int64 inside it. If that's what you want to do, you should use:
results.loc[row, key] = [dfrow[key]]

Categories

Resources