I'm not sure where I'm going wrong here or why my data is coming back wrong. I'm writing this code to use fuzzywuzzy to clean badly entered road names against a list of correct names, replacing each incorrect name with its closest match.
It's returning every line of data2 back to me. I'm expecting it to return the lines of data1, either unchanged or replaced.
My Minimal, Reproducible Example:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

data1 = ('3176 DETRIT ROAD')
data2 = ('DETROIT RD')

try:
    data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    pass

roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()

data2 = data2.split(',')

if street2 not in data2:
    street2 = process.extract(street2, data2)

print(street2[0])
My full code:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def convert_tolist(string):
    li = list(string.split(" "))
    return li

with open(r"Cass_Howard - Copy.csv") as csv_file,\
     open("Final_Test_Clean.csv", "w") as f,\
     open(r"TEST_no_dups12.csv") as ul:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    csv_reader = csv.reader(f, delimiter=',')
    file_1 = csv_file
    file_2 = ul
    for data1, data2 in zip(file_1, file_2):
        data1 = data1.split(',')
        data1 = data1[18]
        data1 = data1.upper()
        data2 = data2.strip()
        data2 = data2.split(',')
        data2 = ''.join(data2)
        try:
            data1 = usaddress.tag(data1)
        except usaddress.RepeatedLabelError:
            pass
        roaddnum2 = data1[0].get('AddressNumber', '')
        roadir2 = data1[0].get('StreetNamePreDirectional', '')
        roadname2 = data1[0].get('StreetName', '')
        roaddsg2 = data1[0].get('StreetNamePostType', '')
        street2 = (roadir2, roadname2, roaddsg2)
        street2 = " ".join(street2)
        street2 = street2.strip()
        data1 = list(data1)
        convert_tolist(data2)
        if street2 not in data2:
            street2 = process.extract(street2, data2)
        print(street2)
street2 query data (around 950 lines):
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2 choices data (around 200 lines):
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN
Okay, I'm not certain I've fully understood your issue, but modifying your reprex, I have produced the following solution.
import usaddress
from fuzzywuzzy import process

data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]

try:
    data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
    pass

parts = [
    data1[0].get("StreetNamePreDirectional"),
    data1[0].get("StreetName"),
    data1[0].get("StreetNamePostType"),
]
street = " ".join([x for x in parts if x])

if street not in choices:
    street = " ".join(
        [data1[0].get("AddressNumber"), process.extract(street, choices)[0][0]]
    )

print(street)
This yields:
3176 DETROIT RD
So basically, it replaces the street name parts with the best match from the choices list, using the process.extract() function.
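For reference, process.extract() returns a list of (match, score) tuples sorted with the best match first, which is why the code above indexes [0][0] to pull out the best-matching string. A minimal sketch of that behaviour (not part of the solution above; the scores shown are only illustrative):

from fuzzywuzzy import process

choices = ["DETROIT RD", "AIRPORT RD"]
matches = process.extract("DETRIT ROAD", choices)  # list of (choice, score) pairs, best first
print(matches)        # e.g. [('DETROIT RD', 86), ('AIRPORT RD', 45)] -- scores are illustrative
print(matches[0][0])  # 'DETROIT RD', the best-matching choice string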
And here it is as a callable function:
import usaddress
from fuzzywuzzy import process


def best_street(addr: str, choices: list[str]) -> str:
    try:
        usaddr = usaddress.tag(addr)
    except usaddress.RepeatedLabelError:
        pass
    street_parts = [
        usaddr[0].get("StreetNamePreDirectional"),
        usaddr[0].get("StreetName"),
        usaddr[0].get("StreetNamePostType"),
    ]
    street = " ".join([x for x in street_parts if x])
    return " ".join(
        [usaddr[0].get("AddressNumber"), process.extract(street, choices)[0][0]]
    )


if __name__ == "__main__":
    choices = ["AIRPORT RD", "DETROIT RD"]
    print(best_street("123 Detrt", choices))
    print(best_street("9876 AIRPUMP DR", choices))
yielding:
123 DETROIT RD
9876 AIRPORT RD
My issue here was that fuzzywuzzy requires you to pass an iterable of choices, so I had to add data2 = data2.split(',') to get full strings back.
My other issue was thinking I needed zip() to compare my files, when zip() pairs items in parallel rather than producing a cross product.
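As a quick illustration of that difference (a standalone sketch, not part of my script):

import itertools

queries = ['1 MAIN ST', '2 OAK AVE']
choices = ['MAIN ST', 'OAK AVE', 'ELM RD']

# zip() pairs the two lists element by element and stops at the shorter one
print(list(zip(queries, choices)))                # 2 pairs
# itertools.product() yields every (query, choice) combination -- the cross product
print(list(itertools.product(queries, choices)))  # 6 pairs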
Here is what I came up with that works. There is no issue with the code, but fuzzywuzzy is not accurate enough for this tool to be practical for cleaning my data, given the number of typos in my address data.
If you can think of a way for me to clean up the if/else statements, I'd be willing to hear it.
import os
import csv
import shutil
import usaddress
import pandas as pd
from fuzzywuzzy import process

with open(r"TEST_Cass_Howard.csv") as csv_file, \
     open(".\Scratch\Final_Test_Clean.csv", "w") as f, \
     open(r"TEST_Unique_List.csv") as ul:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader)
    csv_reader1 = csv.reader(f, delimiter=',')
    correct = list(ul)
    for line in csv_reader:
        line = line[18]
        line = line.upper()
        if line == '' or line == ' ':
            line = ''
        else:
            try:
                addressbrk = usaddress.tag(line)
            except usaddress.RepeatedLabelError:
                addressbrk = line
            line = addressbrk  # alt output: ('Please fix the incorrect format of: %s,' % addressbrk)
        if line != '':
            roadnum2 = line[0].get('AddressNumber', '')
            roadir2 = line[0].get('StreetNamePreDirectional', '')
            roadname2 = line[0].get('StreetName', '')
            roaddsg2 = line[0].get('StreetNamePostType', '')
        else:
            line = ''
        if line != '':
            street2 = (roadir2, roadname2, roaddsg2)
            street2 = " ".join(street2)
            street2 = street2.strip()
        else:
            street2 = ''
        if street2 != '':
            if street2 not in correct:
                street2 = process.extractOne(street2, correct)
        else:
            street2 = '\n'
        if street2 != '':
            tgthr = (roadnum2, street2[0])
        else:
            tgthr = ''
        if tgthr != '':
            final = (' '.join(tgthr))
        else:
            final = ('Null' + '\n')
        f.writelines(final)

original = r"TEST_Cass_Howard.csv"
target = (r'.\Scratch\Cass_Howard_Clean.csv')
shutil.copyfile(original, target)

df1 = pd.read_csv('.\Scratch\Final_Test_Clean.csv', header=None)
df1.columns = ["WELL_ADDR_CLN"]
df = pd.read_csv('.\Scratch\Cass_Howard_Clean.csv')
df = df.join(df1)
new_data = df['WELL_ADDR_CLN']
df = df.drop(columns=['WELL_ADDR_CLN'])
df.insert(loc=19, column='WELL_ADDR_CLN', value=new_data)
os.remove(".\Scratch\Cass_Howard_Clean.csv")
df.to_csv("Cass_Howard_Clean.csv", index=False)
os.remove('.\Scratch\Final_Test_Clean.csv')
Related
The text file looks like this:
data/File_10265.data:
Apple:2kg
Apple:3kg
Banana:1kg
Banana:4kg
Some string1
data/File_10276.data:
Apple:6kg
Apple:5kg
Apple:3kg
Banana:2kg
Banana:4kg
Banana:2kg
Banana:4kg
Extra line
data/File_10278.data:
Apple:3kg
Banana:2kg
Banana:4kg
Banana:2kg
Banana:7kg
Some words
The code is as follows:
import re
import pandas as pd

f = open("Samplefruit.txt", "r")
lines = f.readlines()
Apple_count = 0
Banana_count = 0
File_count = 0
Filename_list = []
Apple_list = []
Banana_list = []
for line in lines:
    match1 = re.findall('data/(?P<File>[^\/]+(?=\..*data))', line)
    if match1:
        Filename_list.append(match1[0])
        print('Match found:', match1)
    if line.startswith("Apple"):
        Apple_count += 1
    elif line.startswith("Banana"):
        Banana_count += 1
Apple_list.append(Apple_count)
Banana_list.append(Banana_count)
df = pd.DataFrame({'Filename': Filename_list,
                   'Apple': Apple_list,
                   'Banana': Banana_list})
The desired output:
Filename: |Apple |Banana
File_10265|2 |2
File_10276|3 |4
File_10278|1 |4
Maybe there is a more efficient way to do this but here's one solution:
with open('filetest.txt') as f:
    lines = f.readlines()

unique_lines = list(dict.fromkeys(lines))

for line in unique_lines:
    print(line + str(lines.count(line)))
    f1 = open('file.txt', 'a')
    f1.write(line + str(lines.count(line)))
    f1.close()
You simply open the file, read all lines into a list, then get rid of any duplicates. Then you loop through the list (now with the duplicates removed), and use the .count (docs) function to get the number of occurrences of each unique item in the list.
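As a compact illustration of that pattern (a standalone sketch with a made-up list, not tied to the files above):

lines = ['Apple:2kg\n', 'Banana:1kg\n', 'Apple:2kg\n']

# dict.fromkeys() keeps only the first occurrence of each line, preserving order
unique_lines = list(dict.fromkeys(lines))

for line in unique_lines:
    # list.count() counts how often each unique line appears in the original list
    print(line.strip(), lines.count(line))
# Apple:2kg 2
# Banana:1kg 1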
Try this:
import itertools
import re

# `text` is assumed to hold the full contents of the input file as a single string
pattern = re.compile(r"data/File_[\d]+.data:")
lines = text.split("\n")
files = itertools.groupby(lines, lambda line: pattern.search(line) == None)
for k, content in files:
    if k == True:
        content = list(content)
        all_words = list(set(content))
        counts = {word: content.count(word) for word in all_words if word != ""}
        print(counts)
Output:
{'Banana:': 2, 'Apple:': 2}
{'Banana:': 4, 'Apple:': 3}
{'Banana:': 4, 'Apple:': 1}
NOTE: New changes have been made to the code as per the changes in the question.
Try this:
import re

text = {}

def unit_cal(val1, val2):  # function to add quantities with units and return the final answer with units
    q1 = re.findall("[0-9]+", val1)
    unit = re.findall("[a-zA-Z]+", val1)
    if (val2 != False):
        q2 = re.findall("[0-9]+", val2)
        ans = int(q1[0]) + int(q2[0])
    else:
        ans = int(q1[0])
    return str(ans) + unit[0]  # remove + unit[0] to return only the value

with open("item.txt", "r") as f1:
    for line in f1:
        if ("data" in line):
            temp_key = line
            k = {}
            text[temp_key] = k
        elif (line.strip() != ""):
            temp_word = line.strip().split(":")
            if temp_word[0] in text[temp_key]:
                text[temp_key][temp_word[0]] = unit_cal(temp_word[1], text[temp_key][temp_word[0]])
            else:
                text[temp_key][temp_word[0]] = unit_cal(temp_word[1], False)

final_text = ""
for main_key in text:
    final_text += main_key + "\n"
    for sub_key in text[main_key]:
        final_text += sub_key + " : " + str(text[main_key][sub_key]) + "\n\n"

print(final_text)  # final output is displayed in the IDLE shell

with open("new_items.txt", "w") as f2:
    f2.write(final_text)  # the output is also written to a new file
Output:
data/File_10265.data:
Apple : 5kg
Banana : 5kg
data/File_10276.data:
Apple : 14kg
Banana : 12kg
data/File_10278.data:
Apple : 3kg
Banana : 15kg
Here I have posted an answer. Thanks @Mani, @CarySwoveland, @Zero, and @M B for your support. The code is as follows:
import pandas as pd

text = {}
File_list = []
with open(r"Samplefruit.txt", "r") as file:
    for line in file:
        if "data" in line:
            Filename = line.split('/')[-1].split('.')[0]
            Apple_count = 0
            Banana_count = 0
            print('----------------')
            print(Filename)
        elif ("Apple" in line or "Banana" in line):
            if line.startswith("Apple"):
                Apple_count += 1
            elif line.startswith("Banana"):
                Banana_count += 1
            print('Apple:', Apple_count)
            print('Banana:', Banana_count)
            text[Filename] = {'Apple': Apple_count, 'Banana': Banana_count}
            File_list.append(Filename)

df = pd.DataFrame(
    {"Filename": text.keys(),
     "Apple": [x['Apple'] for x in text.values()],
     "Banana": [x['Banana'] for x in text.values()]}
)
print(df)
I have a text file, 'student.txt'. Some keys have multiple values. I only want the data that is tied to a name, plus the sibling and hobby values listed below that name.
'student.txt'
ignore me
name-> Alice
name-> Sam
sibling-> Kate,
unwanted
sibling-> Luke,
hobby_1-> football
hobby_2-> games
name-> Ramsay
hobby_1-> dance
unwanted data
hobby_2-> swimming
hobby_3-> jogging
ignore data
The code I've written so far:
file = open("student.txt", "r")
with open("student.csv", "w") as writer:
    main_dict = {}
    student_dict = {"Siblings": "N/A", "Hobbies": "N/A"}
    sibling_list = []
    hobby_list = []
    flag = True
    writer.write('name,siblings,hobbies\n')
    header = 'Name,Siblings,Hobbies'.split(',')
    sib_str = ''
    hob_str = ''
    for eachline in file:
        try:
            key, value = eachline.split("-> ")
            value = value.strip(",\n")
            if flag:
                if key == "name":
                    print(key, value)
                    if len(sibling_list) > 0:
                        main_dict[name]["Siblings"] = sib_str
                        #print(main_dict)
                    if len(hobby_list) > 0:
                        main_dict[name]["Hobbies"] = hob_str
                    sibling_list = []
                    hobby_list = []
                    name = value
                    main_dict[name] = student_dict.copy()
                    main_dict[name]["Name"] = name
                elif key == "sibling":
                    sibling_list.append(value)
                    sib_str = ' '.join(sibling_list).replace(' ', '\n')
                elif key.startswith("hobby"):
                    hobby_list.append(value)
                    hob_str = ' '.join(hobby_list)
            if len(sibling_list) > 0:
                main_dict[name]["Siblings"] = sib_str
            if len(hobby_list) > 0:
                main_dict[name]["Hobbies"] = hob_str
            if 'name' in eachline:
                if 'name' in eachline:
                    flag = True
                else:
                    flag = False
        except:
            pass
    for eachname in main_dict.keys():
        for eachkey in header:
            writer.write(str(main_dict[eachname][eachkey]))
            writer.write(',')
            if 'Hobbies' in eachkey:
                writer.write('\n')
CSV output from the code above and expected CSV output: (images from the original post, not reproduced here)
P.S. I can't figure out how to avoid the try/except with pass: some lines (the ones without '->') are unwanted, and eachline.split("-> ") fails on them. I'd appreciate help with this too.
Thanks so much!
The code below produces a CSV file that you can import into Excel, and it will be in exactly the format you are expecting.
You can use something like
if "->" not in line:
    continue
to skip the lines that don't contain "->"; see it in the code below:
import csv

file = open("student.txt", "r")

students = {}
name = ""
for line in file:
    if "->" not in line:
        continue
    line = line.strip(",\n")
    line = line.replace(" ", "")
    key, value = line.split("->")
    if key == "name":
        name = value
        students[name] = {}
        students[name]["siblings"] = []
        students[name]["hobbies"] = []
    else:
        if "sibling" in key:
            students[name]["siblings"].append(value)
        elif "hobby" in key:
            students[name]["hobbies"].append(value)

#print(students)

csvlines = []
for student in students:
    name = student
    hobbies = students[name]["hobbies"]
    siblings = students[name]["siblings"]
    maxlength = 0
    if len(hobbies) > len(siblings):
        maxlength = len(hobbies)
    else:
        maxlength = len(siblings)
    if maxlength == 0:
        csvlines.append([name, "N/A", "N/A"])
        continue
    for i in range(maxlength):
        if i < len(siblings):
            siblingvalue = siblings[i]
        elif i == len(siblings):
            siblingvalue = "N/A"
        else:
            siblingvalue = ""
        if i < len(hobbies):
            hobbyvalue = hobbies[i]
        elif i == len(hobbies):
            hobbyvalue = "N/A"
        else:
            hobbyvalue = ""
        if i == 0:
            csvlines.append([name, siblingvalue, hobbyvalue])
        else:
            csvlines.append(["", siblingvalue, hobbyvalue])

print(csvlines)

fields = ["name", "siblings", "hobbies"]
with open("students.csv", 'w') as csvfile:
    # creating a csv writer object
    csvwriter = csv.writer(csvfile)
    # writing the fields
    csvwriter.writerow(fields)
    # writing the data rows
    csvwriter.writerows(csvlines)
Here is my code:
inputFile = open("Employees.txt", "r").read()
inputList = inputFile.split("\n")
fList = []

def listString(s):
    string = ""
    return (string.join(s))

for i in inputList:
    for x in i.split(","):
        fList.append(x)

for y in range(len(fList)):
    # (the if/elif block below is the part in question)
    if fList[y] == "90000":
        fList[y] = str(90000 * 1.05) + "\n"
    elif fList[y] == "75000":
        fList[y] = str(75000 * 1.05) + "\n"
    elif fList[y] == "110000":
        fList[y] = str(110000 * 1.05) + "\n"
    else:
        fList[y] = fList[y] + ","

print(listString(fList))
file = open("Emp_Bonus.txt", "a")
file.write(listString(fList))
Employees.txt contains the following:
Adam Lee,Programmer,90000
Morris Heather,DA,75000
John Lee,PM,110000
I am trying to get the following output:
Adam Lee,Programmer,94500
Morris Heather,DA,78750
John Lee,PM,115500
The if/elif block marked in the code above is the problem: the input salaries need to be able to take any value, rather than the code only working for the sample input. Each input salary has to be multiplied by 1.05. How should I go about doing this? Thanks!
Another way, without any library: read the lines of the file into a list using readlines(), then iterate over each line. Split each line with split(',') and modify only the last part (the salary), then finally write the new file as required.
multiply, final_result = 1.05, []

with open('Employees.txt', 'r') as f:
    fList = f.readlines()
    if fList:
        for line in fList:
            employee_info = line.split(',')
            name = employee_info[0]
            designation = employee_info[2]
            salary = float(employee_info[2].replace('\n', '').strip()) * multiply
            final_result.append(f"{name},{employee_info[1]},{salary}")

if final_result:
    with open('Emp_Bonus.txt', 'w') as f:
        f.write('\n'.join(final_result))
Output:
Adam Lee,Programmer,94500.0
Morris Heather,DA,78750.0
John Lee,PM,115500.0
I would like to use Pandas:
import pandas as pd
df = pd.read_csv("Employees.txt",header=None)
df[2] = df.loc[df[2].isin([90000,75000,110000]),2]*1.05
df[2] = df[2].astype(int)
df.to_csv("Emp_Bonus.txt",mode="a",header=None)
I am trying to automate functions based on what input is received, but I'm getting an error when I try to pass the input as an arg to a function. Here's an example of what I'm trying to do:
var = 'hello world'

def example(data):
    # function code
    pass

example(var)
That's a basic version of what I'm doing, and it's returning an error like
var is not defined
Here is my actual code:
import AriaAudioConfig as Ariaconfig
import AriaMathModule as AriaMath
import AriaLocationModule as AriaLocation
import AriaNLPModule as AriaNLP
from inspect import getmembers, isfunction
import re
import pandas as pd
import csv
from typing import Awaitable, Callable, TypeVar

location = ['geolocatecity', 'citydiff', 'locate', 'location', 'where is', 'far', 'distance']
math = ['calculate', 'add', 'subtract', 'multiply', 'divide', 'addition', 'subtraction', 'multiplication', 'division', 'square-root', 'power', 'squared', 'minus']
audio = ['volume', 'speak', 'sound']
nlp = ['translate', 'translation', 'language', 'english', 'spanish', 'french']
locdict = {'geolocatecity': 'blabla', 'citydiff': 'blabla'}
state = 0
city2 = 0

file = pd.read_csv('geolocations.csv')

def dataProcess(data):
    global state
    global city2
    datasearch = data.split()
    argsearch = datasearch
    datalength = len(datasearch)
    for i in range(datalength):
        if datasearch[i] in location:
            data = datasearch[i]
            datacom = typeremoval(functiongrep(AriaLocation))
            datacom = str(datacom).split()
            datalen = len(datacom)
            with open('geolocations.csv', 'rt') as f:
                reader = csv.reader(f, delimiter=',')
                for row in reader:
                    for field in row[0]:
                        for i in range(datalength):
                            if argsearch[i] == row[0]:
                                try:
                                    if city in locals():
                                        city2 = argsearch[i]
                                except:
                                    city = argsearch[i]
                            if argsearch[i] == row[1]:
                                state = argsearch[i]
                            if argsearch[i] == row[2]:
                                country = argsearch[i]
            f.close()
            for i in range(datalen):
                if str(data) in str(datacom[i]):
                    activefunction = datacom[i]
            if state != 0:
                eval('AriaLocation.' + activefunction + '(' + city + ',' + state + ',' + country + ')')
            elif city2 != 0:
                eval('AriaLocation.' + activefunction + '(' + city + ',' + city2 + ')')
            else:
                print('uh-oh something went wrong')
        elif datasearch[i] in math:
            data = datasearch[i]
            datacom = typeremoval(functiongrep(AriaMath))
            print(data)
            if data in datacom:
                print('found')
        elif datasearch[i] in audio:
            data = datasearch[i]
            datacom = typeremoval(functiongrep(Ariaconfig))
        elif datasearch[i] in nlp:
            data = datasearch[i]
            datacom = typeremoval(functiongrep(AriaNLP))

#dataProcess('Aria how far am I from Arizona')

def functiongrep(function):
    string = ''
    functions_list = [o for o in getmembers(function) if isfunction(o[1])]
    flen = len(functions_list)
    for i in range(flen):
        head, sep, tail = str(functions_list[i]).partition('<')
        string = string + head
    return string

def typeremoval(function):
    func = str(function)
    func = str(''.join(func))
    func = re.sub("[',()]", '', func)
    return func

dataProcess('locate Scottsdale Arizona USA')
I want dataProcess() to activate different commands based on what is given as the input.
Exception has occurred: NameError
name 'Scottsdale' is not defined
File "/Users/timyc1/Desktop/DeadIdeas/smartroom/Seavernet/Aria/AriaProcessingModule.py", line 58, in dataProcess
eval('AriaLocation.' + activefunction +'(' + city + ',' + state + ',' + country + ')')
File "/Users/timyc1/Desktop/DeadIdeas/smartroom/Seavernet/Aria/AriaProcessingModule.py", line 95, in <module>
dataProcess('locate Scottsdale Arizona USA')
Don't use eval for this. eval is almost never the solution.
if state != 0:
    getattr(AriaLocation, activefunction)(city, state, country)
elif city2 != 0:
    getattr(AriaLocation, activefunction)(city, city2)
else:
    print('uh-oh something went wrong')
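To spell out why this avoids the NameError (a small illustrative sketch, not using the original Aria modules): getattr() looks the function up by its string name, and the arguments are passed as ordinary Python objects, whereas eval() tried to compile 'Scottsdale' as source code, where it looks like an undefined variable name.

class Demo:
    @staticmethod
    def citydiff(city, other_city):
        return f"{city} -> {other_city}"

activefunction = 'citydiff'
# getattr fetches Demo.citydiff by name; 'Scottsdale' stays a plain string argument
print(getattr(Demo, activefunction)('Scottsdale', 'Phoenix'))  # Scottsdale -> Phoenix

# eval would instead try to evaluate the bare names inside the built-up string:
# eval('Demo.' + activefunction + '(' + 'Scottsdale' + ',' + 'Phoenix' + ')')
# -> NameError: name 'Scottsdale' is not defined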
I tried to solve this problem, but it's just not working. The code exports a .csv file that can be viewed fine in Excel (although it skips a line after every row), but when I upload it to TablePress or websimontables, all of the rows end up in one cell instead of each in its own cell, resulting in a table that looks off.
You can see an example of what I'm talking about here: http://www.dustindavisyourrealtor.com/2015/08/06/85/. The top table is the output from this script; the bottom table is an example of what I'm looking for.
Here is the link to the input .csv file
https://www.dropbox.com/s/figher1vgnlmwju/realtor.csv?dl=0
To run the script: python conv.py input.csv output.csv
import sys
import re
import csv
import urllib

CHARS_TO_REMOVE = (
    0xa0, 0xae
)

RE1 = re.compile('^\d+$')
RE2 = re.compile('^\w{3}\s\d+\/\d+$')

FLAG1 = 'f1'
FLAG2 = 'f2'

URL_BASE = 'https://www.instantstreetview.com/s/%s'
EMAIL_LINK = '<a href=\'https://mail.google.com/mail/?view=cm&fs=1&tf=1&to=email#here.ca%20&su=Hello%20again\'>I want more information about this house</a>'

def cleanLine(line):
    for ch in CHARS_TO_REMOVE:
        line = line.replace(chr(ch), '')
    return line.strip()

def checkRow(row):
    m1 = RE1.match(row[0])
    if m1:
        return FLAG1
    else:
        m2 = RE2.match(row[0])
        if m2:
            return FLAG2
    return None

def formatRow(row):
    Addr = ' '.join(row['row1'][5:8])
    Url = URL_BASE % urllib.quote_plus(Addr)
    Row = {
        #'Date' : row['row2'][0],
        #'Time' : row['row2'][1],
        'Date / Time' : ' '.join(row['row2'][0:2]),
        'Type' : row['row1'][1],
        'Beds' : row['row1'][2],
        'Bath' : row['row1'][3],
        'Address' : row['row1'][5] + ', ' + row['row1'][6],
        'Listed At' : row['row1'][4],
        'Area' : '<a href=\'%s\'>See the neighbourhood</a>' % Url,
        'Get In Touch!' : EMAIL_LINK
    }
    return Row

def main(fnameIn, fnameOut):
    Rows = []
    row1 = None
    row2 = None
    for row in csv.reader(file(fnameIn).readlines()):
        row = map(cleanLine, row)
        flag = checkRow(row)
        if flag == FLAG1:
            row1 = row
            row2 = None
        elif flag == FLAG2:
            row2 = row
        if row1 and row2:
            Rows.append(dict(row1=row1, row2=row2))
            row1 = row2 = None
    with open(fnameOut, 'w') as f:
        fieldnames = ['Date / Time', 'Type', 'Beds', 'Bath', 'Address', 'Listed At', 'Area', 'Get In Touch!']
        writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for row in Rows:
            writer.writerow(formatRow(row))

if __name__ == '__main__':
    main(sys.argv[1], sys.argv[2])
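On the blank-line symptom: this is Python 2 style code (file(), urllib.quote_plus), and with the csv module the usual cause of an extra blank row after every record when viewed in Excel is opening the output file in text mode. A minimal sketch of the usual fix, under the assumption that this is what is happening here:

import csv

fieldnames = ['Type', 'Beds']
rows = [{'Type': 'House', 'Beds': '3'}]

# Python 2: open the output in binary mode so the csv module controls line endings
# with open('out.csv', 'wb') as f:
# Python 3: keep text mode but pass newline='' for the same reason
with open('out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    writer.writerows(rows)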