I am trying to automate a functions based on what input is received but I'm getting an error when I try to pass the input as and arg for a function. Heres an example of what Im trying to do
var ='hello world'
def example(data):
#function code
example(var)
thats a basic usage of what Im doing and its returning an error like
var is not defined
here is my actual code
import AriaAudioConfig as Ariaconfig
import AriaMathModule as AriaMath
import AriaLocationModule as AriaLocation
import AriaNLPModule as AriaNLP
from inspect import getmembers, isfunction
import re
import pandas as pd
import csv
from typing import Awaitable, Callable, TypeVar
location = ['geolocatecity','citydiff','locate', 'location', 'where is', 'far', 'distance']
math = ['calculate', 'add', 'subtract', 'multiply', 'divide', 'addition', 'subtraction', 'multiplication', 'division', 'square-root', 'power', 'squared', 'minus']
audio = ['volume','speak', 'sound']
nlp = ['translate', 'translation', 'language', 'english', 'spanish', 'french']
locdict = {'geolocatecity':'blabla','citydiff':'blabla'}
state = 0
city2 = 0
file = pd.read_csv('geolocations.csv')
def dataProcess(data):
global state
global city2
datasearch = data.split()
argsearch = datasearch
datalength = len(datasearch)
for i in range(datalength):
if datasearch[i] in location:
data = datasearch[i]
datacom = typeremoval(functiongrep(AriaLocation))
datacom = str(datacom).split()
datalen = len(datacom)
with open('geolocations.csv', 'rt') as f:
reader = csv.reader(f, delimiter=',')
for row in reader:
for field in row[0]:
for i in range(datalength):
if argsearch[i] == row[0]:
try:
if city in locals():
city2 = argsearch[i]
except:
city = argsearch[i]
if argsearch[i] == row[1]:
state = argsearch[i]
if argsearch[i] == row[2]:
country = argsearch[i]
f.close()
for i in range(datalen):
if str(data) in str(datacom[i]):
activefunction = datacom[i]
if state != 0:
eval('AriaLocation.' + activefunction +'(' + city + ',' + state + ',' + country + ')')
elif city2 != 0:
eval('AriaLocation.' + activefunction + '(' + city + ',' + city2 + ')')
else:
print('uh-oh something went wrong')
elif datasearch[i] in math:
data = datasearch[i]
datacom = typeremoval(functiongrep(AriaMath))
print(data)
if data in datacom:
print('found')
elif datasearch[i] in audio:
data = datasearch[i]
datacom = typeremoval(functiongrep(Ariaconfig))
elif datasearch[i] in nlp:
data = datasearch[i]
datacom = typeremoval(functiongrep(AriaNLP))
#dataProcess('Aria how far am I from Arizona')
def functiongrep(function):
string = ''
functions_list = [o for o in getmembers(function) if isfunction(o[1])]
flen = len(functions_list)
for i in range(flen):
head, sep, tail = str(functions_list[i]).partition('<')
string = string + head
return string
def typeremoval(function):
func = str(function)
func = str(''.join(func))
func = re.sub("[',()]", '', func)
return func
dataProcess('locate Scottsdale Arizona USA')
I want dataProcess() to activate different commands based on what is given as the input.
Exception has occurred: NameError
name 'Scottsdale' is not defined
File "/Users/timyc1/Desktop/DeadIdeas/smartroom/Seavernet/Aria/AriaProcessingModule.py", line 58, in dataProcess
eval('AriaLocation.' + activefunction +'(' + city + ',' + state + ',' + country + ')')
File "/Users/timyc1/Desktop/DeadIdeas/smartroom/Seavernet/Aria/AriaProcessingModule.py", line 95, in <module>
dataProcess('locate Scottsdale Arizona USA')
Don't use eval for this. eval is almost never the solution.
if state != 0:
getattr(AriaLocation, activefunction)(city, state, country)
elif city2 != 0:
getattr(AriaLocation, activefunction)(city, cit2)
else:
print('uh-oh something went wrong')
Related
I inherited a bit of Python code and I have no background. I am getting unbound local error and is probably something really silly on my part.
UnboundLocalError Traceback (most recent call last)
Input In [1], in <cell line: 372>()
368 print('\n----------------------------------------------------------------------\n')
369 ##############################################################################
--> 372 main()
Input In [1], in main()
319 Gender = p.getGender()
320 StateRes = p.getStateRes()
--> 321 children = immunte(Fname, Lname, DOB, Gender, driver)
323 if children == []:
324 not_found += 1
Input In [1], in immunte(Fname, Lname, DOB, Gender, driver)
204 except WebDriverException:
205 al = []
--> 207 return al
UnboundLocalError: local variable 'al' referenced before assignment
I have looked at this for a few days now and I can't seem to find an answer to this problem even though it is likely simple. It seems a solution to this error is a global keyword somewhere but I am not sure if this applies here as every time I tried to apply global to al = [] i got an error or same result. Any help is appreciated, thank you.
# Imports
import csv
import datetime
import os
import os.path
import time
import pandas as pd
from dateutil.parser import parse
from pandas import DataFrame
from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
WebDriverException)
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
##############################################################################
# Classes
class Person(object):
def __init__(self, measYr, MEMID, MIDSK, fname, lname, LNMSFX, DRB, GDR, STRES, meas):
self.measYr = measYr
self.MEMID = MEMID
self.MIDSK = MIDSK
self.fname = fname
self.lname = lname
self.LNMSFX = LNMSFX
self.DRB = DRB
self.GDR = GDR
self.STRES = STRES
self.meas = meas
def GTMESYR(self):
return self.measYr
def GTMEMSKY(self):
return self.MIDSK
def GTMEMID(self):
return self.MEMID
def GTFSNM(self):
return self.fname
def GTLSNM(self):
return self.lname
def GTLSTNMSF(self):
return self.LNMSFX
def GTDRB(self):
return self.DRB
def GTGDR(self):
return self.GDR
def getStateRes(self):
return self.STRES
def getMeas(self):
return self.meas
###############################################################################
# Function
def is_date(string, fuzzy=False):
try:
parse(string, fuzzy=fuzzy)
return True
except ValueError:
return False
def immunte(Fname, Lname, DRB, GDR, driver):
# work on search button
driver.find_element_by_xpath("//*[#id='edittesttest']").click()
# work on
lastname = driver.find_element_by_id("LM")
lastname.clear()
lastname.send_keys(Lname)
# work on
firstname = driver.find_element_by_id("FN")
firstname.clear()
firstname.send_keys(Fname)
# work on
birthdate = driver.find_element_by_id("DRB")
birthdate.clear()
birthdate.send_keys(DRB)
# work on advanced search button to input GDR
try:
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td[5]/input").click()
# work on GDR selection button
obj = Select(driver.find_element_by_name("OSC"))
if GDR == 'W':
obj.select_by_index(2)
elif GDR == 'S':
obj.select_by_index(1)
else:
obj.select_by_index(3)
# work on search button
driver.find_element_by_name("cmdFindClient").click()
# two scenarios could emerge as a search result: 1, not found 2, the found
if "No were found for the requested search criteria" in driver.find_element_by_id("queryResultsForm").text:
al = []
elif "the found" in driver.find_element_by_id("queryResultsForm").text:
# work on button
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table[2]/tbody/tr[2]/td[2]/span/label").click()
# work on pt button
driver.find_element_by_id("redirect1").click()
# work on getting rid of opt out - header
header = driver.find_elements_by_class_name("large")[1].text
if "Access Restricted" in header:
print(Fname+' '+Lname+' '+" Opt out")
al = []
elif "Information" in header:
# find the first line
first = driver.find_element_by_xpath(
"//*[#id='container']/table[3]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[5]/td[1]").text
if (first == None):
al = []
else:
even = driver.find_elements_by_class_name("evenRow")
odd = driver.find_elements_by_class_name("oddRow")
o = []
e = []
for value in odd:
o.append(value.text)
for value in even:
e.append(value.text)
length = len(o)
i = 0
al = []
# merge odd and even row together and remove the row marked with complete
while i < length:
al.append(e[i])
al.append(o[i])
i = i+1
# parse each row of information with a comma, add group name for row that are without one
for x in range(len(al)):
if is_date(al[x][1:10]):
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
al[x] = group + ',' + al[x][2:]
else:
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
g = al[x].split(',', 1)
group = g[0]
# work on returning to home page
driver.find_element_by_xpath(
"//*[#id='headerMenu']/table/tbody/tr/td[2]/div/a").click()
except NoSuchElementException:
al = []
except WebDriverException:
al = []
return al
def main():
# Welcome message and input info
print('\nThis is the test.')
print('You will be prompted to type .')
print('If you need to exit the script and stop its process press \'CTRL\' + \'C\'.')
file = input("\nEnter file name: ")
user = input("\nEnter username: ")
pw = input("\nEnter password: ")
date = str(datetime.date.today())
# output file
fileOutputName = 'FILELIST' + \
date.replace('-', '_') + '.csv'
fileOutputNameNotFound = 'NOTFOUNDFILELIST' + \
date.replace('-', '_') + '.csv'
fileOutput = open(fileOutputName, 'w')
fileOutputNotFound = open(fileOutputNameNotFound, 'w')
fileOutput.write('MEAS_YR,MEMLFIDSK,MEMLFID,MEMB_FRST_NM,MEMLSTNM,' +
'DRB,GNDR,RSDNC_STATE,IMUN_RGSTRY_STATE,VCCN_GRP,VCCN_ADMN_DT,DOSE_SERIES,' +
'BRND_NM,DOSE_SIZE,RCTN\n')
fileOutputNotFound.write('MEAS_YR,MEMLFIDSK,MEMLFID,MEMB_FRST_NM,MEMLSTNM,MEMB_SUFFIX,' +
'DRB,GNDR,RSDNC_STATE,IMUN_RGSTRY_STATE,VCCN_GRP,VCCN_ADMN_DT,DOSE_SERIES,' +
'BRND_NM,DOSE_SIZE,RCTN\n')
# If the file exists
try:
os.path.isfile(file)
except:
print('File Not Found\n')
df = pd.read_excel(file)
# create array of People objects and member ID
peopleArray = []
memberIdArray = []
df.dropna()
total = len(df)
not_found = 0
found = 0
# assign each record in the data frame into Person class
for i in range(total):
measYr = str(df.loc[i, "MEAS_YR"])
MEMID = str(df.loc[i, "MEMLFID"])
MIDSK = str(df.loc[i, "MEMLFIDSK"])
fname = str(df.loc[i, "MEMLFID"])
lname = str(df.loc[i, "MEMLSTNM"])
inputDate = str(df.loc[i, "DRB"])
# If date is null then assign an impossible date
if not inputDate:
DRB = '01/01/1900'
if '-' in inputDate:
DRB = datetime.datetime.strptime(
inputDate, "%Y-%m-%d %H:%M:%S").strftime('%m/%d/%Y')
else:
DRB = datetime.datetime.strptime(
str(df.loc[i, "DRB"]), '%m/%d/%Y').strftime('%m/%d/%Y')
GDR = str(df.loc[i, "GDR"])
STRES = str(df.loc[i, "STATE_RES"])
meas = str(df.loc[i, "MEAS"])
p = Person(measYr, MEMID, MIDSK, fname, lname,
LNMSFX, DRB, GDR, STRES, meas)
# append array
m = df.loc[i, "MEMLFID"]
if (m not in memberIdArray):
peopleArray.append(p)
memberIdArray.append(m)
# work on setting up driver for md immunet - mac forward slash/windows double backward slash
PATH = os.getcwd()+'\\'+'chromedriver'
s = Service(PATH)
driver = webdriver.Chrome(service = s)
driver.get("https://www.wow2.pe.org/prd-IR/portalmanager.do")
# work on login ID
username = driver.find_element_by_id("userField")
username.clear()
username.send_keys(user)
# work on password
password = driver.find_element_by_name("password")
password.clear()
password.send_keys(pw)
# work on getting to home page - where loop will start
driver.find_element_by_xpath(
"//*[#id='loginButtonForm']/div/div/table/tbody/tr[3]/td[1]/input").click()
for n in range(total):
p = peopleArray[n]
recordToWrite = ''
print('Looking up: ' + str(n)+' ' +
p.GTLSNM() + ', ' + p.GTFSNM())
MeasYr = p.GTMESYR()
MIDSK = p.GTMEMSKY()
MEMID = p.GTMEMID()
Fname = p.GTFSNM()
Lname = p.GTLSNM()
DRB = str(p.GTDRB())
GDR = p.GTGDR()
STRES = p.getStateRes()
children = immunte(Fname, Lname, DRB, GDR, driver)
if children == []:
not_found += 1
recordToWrite = MeasYr+','+MIDSK+','+MEMID+',' + Fname + \
','+Lname + ',' + ' ' + ','+DRB+','+GDR+','+STRES+','+'MD'
fileOutputNotFound.write(recordToWrite + '\n')
elif children != []:
found += 1
for x in range(len(children)):
data_element = children[x].split(",")
# if the admin date is not valid
if is_date(data_element[1]) and is_date(data_element[3]):
children[x] = ''
elif is_date(data_element[1]) and data_element[2] == 'NOT' and data_element[3] == 'VALID':
children[x] = ''
elif is_date(data_element[1]) and is_date(data_element[3]) == False:
if data_element[5] != 'No':
data_element[4] = data_element[5]
data_element[5] = ''
children[x] = ','.join(data_element[0:6])
else:
data_element[5] = ''
children[x] = ','.join(data_element[0:6])
else:
children[x] = ''
for x in range(len(children)):
if children[x] != '':
recordToWrite = MeasYr+','+MIDSK+','+MEMID+',' + \
Fname+','+Lname + ','+DRB+','+GDR+','+STRES+','+'MD'
recordToWrite = recordToWrite+','+children[x]
fileOutput.write(recordToWrite + '\n')
n = +1
fileOutput.close()
fileOutputNotFound.close()
print('\n--------------------------------OUTPUT--------------------------------')
print("Script completed.")
##############################################################################
main()
You can try this in the 'immunte' function:
def immunte(Fname, Lname, DRB, GDR, driver):
al = []
# work on search button
driver.find_element_by_xpath("//*[#id='edittesttest']").click()
# work on
lastname = driver.find_element_by_id("LM")
lastname.clear()
lastname.send_keys(Lname)
# work on
firstname = driver.find_element_by_id("FN")
firstname.clear()
firstname.send_keys(Fname)
# work on
birthdate = driver.find_element_by_id("DRB")
birthdate.clear()
birthdate.send_keys(DRB)
# work on advanced search button to input GDR
try:
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td[5]/input").click()
# work on GDR selection button
obj = Select(driver.find_element_by_name("OSC"))
if GDR == 'W':
obj.select_by_index(2)
elif GDR == 'S':
obj.select_by_index(1)
else:
obj.select_by_index(3)
# work on search button
driver.find_element_by_name("cmdFindClient").click()
# two scenarios could emerge as a search result: 1, not found 2, the found
if "No were found for the requested search criteria" in driver.find_element_by_id("queryResultsForm").text:
return al
if "the found" in driver.find_element_by_id("queryResultsForm").text:
# work on button
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table[2]/tbody/tr[2]/td[2]/span/label").click()
# work on pt button
driver.find_element_by_id("redirect1").click()
# work on getting rid of opt out - header
header = driver.find_elements_by_class_name("large")[1].text
if "Access Restricted" in header:
print(Fname+' '+Lname+' '+" Opt out")
return al
if "Information" in header:
# find the first line
first = driver.find_element_by_xpath(
"//*[#id='container']/table[3]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[5]/td[1]"
).text
if (first == None):
return al
even = driver.find_elements_by_class_name("evenRow")
odd = driver.find_elements_by_class_name("oddRow")
o = []
e = []
for value in odd:
o.append(value.text)
for value in even:
e.append(value.text)
length = len(o)
i = 0
al = []
# merge odd and even row together and remove the row marked with complete
while i < length:
al.append(e[i])
al.append(o[i])
i = i+1
# parse each row of information with a comma, add group name for row that are without one
for x in range(len(al)):
if is_date(al[x][1:10]):
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
al[x] = group + ',' + al[x][2:]
else:
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
g = al[x].split(',', 1)
group = g[0]
# work on returning to home page
driver.find_element_by_xpath(
"//*[#id='headerMenu']/table/tbody/tr/td[2]/div/a").click()
except NoSuchElementException:
pass
except WebDriverException:
pass
return al
I'm not sure where I'm going wrong here and why my data is returning wrong. Writing this code to use fuzzywuzzy to clean bad input road names against a list of correct names, replacing the incorrect with the closest match.
It's returning all lines of data2 back. I'm looking for it to return the same, or replaced lines of data1 back to me.
My Minimal, Reproducible Example:
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
data1 =('3176 DETRIT ROAD')
data2 =('DETROIT RD')
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data2 = data2.split(',')
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2[0])
My full code
import pandas as pd
import os
import csv
import usaddress
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
def convert_tolist(string):
li = list(string.split(" "))
return li
with open(r"Cass_Howard - Copy.csv") as csv_file,\
open("Final_Test_Clean.csv", "w") as f,\
open(r"TEST_no_dups12.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader = csv.reader(f, delimiter=',')
file_1 = csv_file
file_2 = ul
for data1, data2 in zip(file_1, file_2):
data1 = data1.split(',')
data1 = data1[18]
data1 = data1.upper()
data2 = data2.strip()
data2 = data2.split(',')
data2 = ''.join(data2)
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
roaddnum2 = data1[0].get('AddressNumber', '')
roadir2 = data1[0].get('StreetNamePreDirectional', '')
roadname2 = data1[0].get('StreetName', '')
roaddsg2 = data1[0].get('StreetNamePostType', '')
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
data1 = list(data1)
convert_tolist(data2)
if street2 not in data2:
street2 = process.extract(street2, data2)
print(street2)
street2 query data(around 950 lines)
DETROIT ROAD
DETROIT ROAD
MANNIX ST
MANNIX ST
data2 choices data(around 200 lines)
ACRES
ADERSON RD
AIRPORT RD
ALGONQUIN
Okay, I'm not certain I've fully understood your issue, but modifying your reprex, I have produced the following solution.
import usaddress
from fuzzywuzzy import process
data1 = "3176 DETRIT ROAD"
choices = ["DETROIT RD"]
try:
data1 = usaddress.tag(data1)
except usaddress.RepeatedLabelError:
pass
parts = [
data1[0].get("StreetNamePreDirectional"),
data1[0].get("StreetName"),
data1[0].get("StreetNamePostType"),
]
street = " ".join([x for x in parts if x])
if street not in choices:
street = " ".join(
[data1[0].get("AddressNumber"), process.extract(street, choices)[0][0]]
)
print(street)
This yields:
3176 DETROIT RD
So basically, it has replaces the street name bits with the best match from the choices list using the process.extract() function.
And here it is as a callable function:
import usaddress
from fuzzywuzzy import process
def best_street(addr: str, choices: list[str]) -> str:
try:
usaddr = usaddress.tag(addr)
except usaddress.RepeatedLabelError:
pass
street_parts = [
usaddr[0].get("StreetNamePreDirectional"),
usaddr[0].get("StreetName"),
usaddr[0].get("StreetNamePostType"),
]
street = " ".join([x for x in street_parts if x])
return " ".join(
[usaddr[0].get("AddressNumber"), process.extract(street, choices)[0][0]]
)
if __name__ == "__main__":
choices = ["AIRPORT RD", "DETROIT RD"]
print(best_street("123 Detrt", choices))
print(best_street("9876 AIRPUMP DR", choices))
yielding:
123 DETROIT RD
9876 AIRPORT RD
My issue here was fuzzywuzzy requires you to pass an iterable so I had to add data2 = data2.split(',') to get the full strings to return.
My other issue was thinking I needed to use zip() to compare my files when zip() is for comparing parallel and not cross products.
Here is what I came up with that works. There is no issue with the code but fuzzywuzzy is not accurate enough for this tool to be practical to clean my data with the amount of typos in my address data.
If you can think of a way for me to clean up the if/else statement i'd be willing to hear it.
import os
import csv
import shutil
import usaddress
import pandas as pd
from fuzzywuzzy import process
with open(r"TEST_Cass_Howard.csv") as csv_file, \
open(".\Scratch\Final_Test_Clean.csv", "w") as f, \
open(r"TEST_Unique_List.csv") as ul:
csv_reader = csv.reader(csv_file, delimiter=',')
next(csv_reader)
csv_reader1 = csv.reader(f, delimiter=',')
correct = list(ul)
for line in csv_reader:
line = line[18]
line = line.upper()
if line == '' or line == ' ':
line = ''
else:
try:
addressbrk = usaddress.tag(line)
except usaddress.RepeatedLabelError:
addressbrk = line
line = addressbrk # alt output: ('Please fix the incorect format of: %s,' % addressbrk)
if line != '':
roadnum2 = line[0].get('AddressNumber', '')
roadir2 = line[0].get('StreetNamePreDirectional', '')
roadname2 = line[0].get('StreetName', '')
roaddsg2 = line[0].get('StreetNamePostType', '')
else:
line = ''
if line != '':
street2 = (roadir2, roadname2, roaddsg2)
street2 = " ".join(street2)
street2 = street2.strip()
else:
street2 = ''
if street2 != '':
if street2 not in correct:
street2 = process.extractOne(street2, correct)
else:
street2 = '\n'
if street2 != '':
tgthr = (roadnum2, street2[0])
else:
tgthr = ''
if tgthr != '':
final = (' '.join(tgthr))
else:
final = ('Null' + '\n')
f.writelines(final)
original = r"TEST_Cass_Howard.csv"
target = (r'.\Scratch\Cass_Howard_Clean.csv')
shutil.copyfile(original, target)
df1 = pd.read_csv('.\Scratch\Final_Test_Clean.csv', header=None)
df1.columns = ["WELL_ADDR_CLN"]
df = pd.read_csv('.\Scratch\Cass_Howard_Clean.csv')
df = df.join(df1)
new_data = df['WELL_ADDR_CLN']
df = df.drop(columns=['WELL_ADDR_CLN'])
df.insert(loc=19, column='WELL_ADDR_CLN', value=new_data)
os.remove(".\Scratch\Cass_Howard_Clean.csv")
df.to_csv("Cass_Howard_Clean.csv", index=False)
os.remove('.\Scratch\Final_Test_Clean.csv')
I have a text file, 'student.txt'. Some keys have multiple values. I only want data that is tied to the name, and the sibling & hobby values below that name.
'student.txt'
ignore me
name-> Alice
name-> Sam
sibling-> Kate,
unwanted
sibling-> Luke,
hobby_1-> football
hobby_2-> games
name-> Ramsay
hobby_1-> dance
unwanted data
hobby_2-> swimming
hobby_3-> jogging
ignore data
Code I've done:
file = open("student.txt", "r")
with open("student.csv", "w") as writer:
main_dict = {}
student_dict = {"Siblings": "N/A", "Hobbies": "N/A"}
sibling_list = []
hobby_list = []
flag = True
writer.write ('name,siblings,hobbies\n')
header = 'Name,Siblings,Hobbies'.split(',')
sib_str = ''
hob_str =''
for eachline in file:
try:
key, value = eachline.split("-> ")
value = value.strip(",\n")
if flag:
if key == "name":
print (key,value)
if len(sibling_list) > 0:
main_dict[name]["Siblings"] = sib_str
#print (main_dict)
if len(hobby_list) > 0:
main_dict[name]["Hobbies"] = hob_str
sibling_list = []
hobby_list = []
name = value
main_dict[name] = student_dict.copy()
main_dict[name]["Name"] = name
elif key == "sibling":
sibling_list.append(value)
sib_str= ' '.join(sibling_list).replace(' ', '\n')
elif key.startswith("hobby"):
hobby_list.append(value)
hob_str = ' '.join(hobby_list)
if len(sibling_list) > 0:
main_dict[name]["Siblings"] = sib_str
if len(hobby_list) > 0:
main_dict[name]["Hobbies"] = hob_str
if 'name' in eachline:
if 'name' in eachline:
flag = True
else:
flag = False
except:
pass
for eachname in main_dict.keys():
for eachkey in header:
writer.write(str(main_dict[eachname][eachkey]))
writer.write (',')
if 'Hobbies' in eachkey:
writer.write ('\n')
CSV Output from Code above:
Expected CSV Output:
P.S: I can't seem to figure out how to not forgo the try/pass. As some lines (without '->') are unwanted, and I can't use the eachline.split("-> "). Would appreciate help on this too.
Thanks so much!
The code below gives the csv file which you can import in your Excel and it will be in exact format you are expecting.
You can use something like
if "->" not in line:
continue
To skip lines that don't contain "->" values, see in the code below:
import csv
file = open("student.txt", "r")
students = {}
name = ""
for line in file:
if "->" not in line:
continue
line = line.strip(",\n")
line = line.replace(" ", "")
key, value = line.split("->")
if key == "name":
name = value
students[name] = {}
students[name]["siblings"] = []
students[name]["hobbies"] = []
else:
if "sibling" in key:
students[name]["siblings"].append(value)
elif "hobby" in key:
students[name]["hobbies"].append(value)
#print(students)
csvlines = []
for student in students:
name = student
hobbies = students[name]["hobbies"]
siblings = students[name]["siblings"]
maxlength = 0
if len(hobbies) > len(siblings) :
maxlength = len(hobbies)
else:
maxlength = len(siblings)
if maxlength == 0:
csvlines.append([name, "N/A", "N/A"])
continue
for i in range(maxlength):
if i < len(siblings):
siblingvalue = siblings[i]
elif i == len(siblings):
siblingvalue = "N/A"
else:
siblingvalue = ""
if i < len(hobbies):
hobbyvalue = hobbies[i]
elif i == len(siblings):
hobbyvalue = "N/A"
else:
hobbyvalue = ""
if i == 0:
csvlines.append([name, siblingvalue, hobbyvalue])
else:
csvlines.append(["", siblingvalue, hobbyvalue])
print(csvlines)
fields = ["name", "siblings", "hobbies"]
with open("students.csv", 'w') as csvfile:
# creating a csv writer object
csvwriter = csv.writer(csvfile)
# writing the fields
csvwriter.writerow(fields)
# writing the data rows
csvwriter.writerows(csvlines)
I am trying to get data from a txt file and put it into a dataframe. Text file looks something like this:
******************************************************************************************************************************
DATE BUY:2018/05/26
****************************************************************************************************************************
STORE: DUBIDUBI SAILOR: 123456
***********************************************************************************************************************
< CLIENT >
NAME CLIENT MEMBER TYPE MEMBER NUMBER: 89101112
ANTONY STARK 1
<PRODUCTS>
NUM-PRODUCTS
6
< ADDRESS TO SEND>
186 FLEET STREET
-----------------------------------------------------------------------------------------------------------------------
< CLIENT >
NAME CLIENT MEMBER TYPE MEMBER NUMBER: 13141516
THOR 2
<PRODUCTS>
NUM-PRODUCTS
2
< ADDRESS TO SEND>
1800 PENNSYLVANIA STREET
<SERVICES>
NUM-SERVICE TYPE OF SERVICE
64 DEVOLUTION
*****************************************************************************************************************************
I want to get a dataframe containing a list of all the clients information that were assisted by same sailor in same store.
What does works: below code reads the text line by line and extract the information of each line.
data []
global STORE, DATE_BUY, SAILOR, CLIENT, MEMBER_NUM, NUM_PRODUCTS, ADDRESS, NUM_SERVICE, TYPE_MEMB, TYPE_SERV
STORE = ""
DATE_BUY = ""
SAILOR = ""
CLIENT = ""
MEMBER_NUM = ""
NUM_PRODUCTS = ""
ADDRESS = ""
NUM_SERVICE = ""
TYPE_MEMB = ""
TYPE_SERV = ""
with open ('myfile', 'r') as txt_file:
read_file = txt_file.readlines()
for i in range(0, len(read_file)):
line = read_file[i]
z = line[0:50]
a = line[0:9]
b = line[0:42]
c = line[112:132]
d = line[0:14]
e = line[0:14]
dif_client = line[0:58]
if a == " STORE":
STORE = line[10:28]
SAILOR = line[30:45]
elif c == " DATE BUY":
DATE_BUY = line[133:145]
elif b == " NAME CLIENT"
nextline = read_file[i + 1]
CLIENT = nextline[0:57]
MEMBER_NUM = nextline[96:126]
TYPE_MEMB = nextline[79:86]
elif d == " < ADDRESS":
nextline = read_file[i + 1]
ADDRESS = nextline[0:63]
elif e == " < PRODUCTS":
nextline = read_file[i + 1]
NUM_PRODUCTS = nextline[0:24]
elif f == " <SERVICES":
nextline = read_file[i + 1]
NUM_SERVICE = nextline[]
TYPE_SERV = nextline[]
data.append({'Store':STORE, 'Sailor':SAILOR, 'Date_Buy':DATE_BUY, 'Client':CLIENT, 'Member_Num':MEMBER_NUM,
'Type_Memb':TYPE_MEMB, 'Address':ADDRESS, 'Products':NUM_PRODUCTS,'Num_Serv':NUM_SERVICE, 'Type_Serv':TYPE_SERV})
df = pd.DataFrame(data)
What does NOT works: when using a nested while loop to extract information of each client assisted by a sailor my code simply does not end running. The code that does not work is:
data []
global STORE, DATE_BUY, SAILOR, CLIENT, MEMBER_NUM, NUM_PRODUCTS, ADDRESS, NUM_SERVICE, TYPE_MEMB, TYPE_SERV
STORE = ""
DATE_BUY = ""
SAILOR = ""
CLIENT = ""
MEMBER_NUM = ""
NUM_PRODUCTS = ""
ADDRESS = ""
NUM_SERVICE = ""
TYPE_MEMB = ""
TYPE_SERV = ""
with open ('myfile', 'r') as txt_file:
read_file = txt_file.readlines()
for i in range(0, len(read_file)):
line = read_file[i]
z = line[0:50]
a = line[0:9]
b = line[0:42]
c = line[112:132]
d = line[0:14]
e = line[0:14]
dif_client = line[0:58]
while dif_client != " < CLIENT >":
if a == " STORE":
STORE = line[10:28]
SAILOR = line[30:45]
elif c == " DATE BUY":
DATE_BUY = line[133:145]
elif b == " NAME CLIENT"
nextline = read_file[i + 1]
CLIENT = nextline[0:57]
MEMBER_NUM = nextline[96:126]
TYPE_MEMB = nextline[79:86]
elif d == " < ADDRESS":
nextline = read_file[i + 1]
ADDRESS = nextline[0:63]
elif e == " < PRODUCTS":
nextline = read_file[i + 1]
NUM_PRODUCTS = nextline[0:24]
elif f == " <SERVICES":
nextline = read_file[i + 1]
NUM_SERVICE = nextline[]
TYPE_SERV = nextline[]
data.append({'Store':STORE, 'Sailor':SAILOR, 'Date_Buy':DATE_BUY, 'Client':CLIENT, 'Member_Num':MEMBER_NUM,
'Type_Memb':TYPE_MEMB, 'Address':ADDRESS, 'Products':NUM_PRODUCTS,'Num_Serv':NUM_SERVICE, 'Type_Serv':TYPE_SERV})
df = pd.DataFrame(data)
The desired output should look something like this. I know that each client information comes when the word < CLIENT > appears in text.
Date_buy Store Sailor Client Member_Number Num_Products Address_to_send num_Service type_serv
2018/05/26 dubidubi 123456 ANTONY STARK 89101112 6 186 FLEET STREET
2018/05/26 dubidubi 123456 THOR 13141516 2 1800 PENNSYLVANIA STREET 64 DEVOLUTION
Thanks for the description. The problem is in the infinite loop you built:
dif_client = line[0:58]
while dif_client != " < CLIENT >":
if a == " STORE":
...
dif_client doesn't change within the loop. There is no break or other exit, only the while condition (which is good design). Therefore, once you get into the loop, you have no way to leave: dif_client is constant.
Your logic is incorrect: you have two loops that are trying to walk through the lines of the file:
for i in range(0, len(read_file)):
...
while dif_client != " < CLIENT >":
# Process one line
The body of the while is designed to process one line. When you're done with that, you need to go to the next iteration of the for to get the next line. Finding a CLIENT line is a if decision, not a loop.
The Code Below I wrote takes input from a sample file which contains First and Last names. Then it converts those names to sample emails. For some reason the Script keeps printing the same Last name over and over.
namess.txt looks like this:
firstname,lastname
CODE:
import os, re, time, getpass, linecache
Original = os.path.join(os.path.expanduser('~'), 'Desktop','namess.txt')
File = os.path.join(os.path.expanduser('~'), 'Desktop','output.txt')
badNames = []
Names = []
def RemCommas():
outfile = open(os.path.join('C:\\', 'Users', getpass.getuser(), 'Desktop','output.txt'),'w')
Filedata = open(Original).read()
outfile.write(re.sub(',', ' ', Filedata))
outfile.close()
def ClassNum():
count = 6
Year = int(time.strftime('%Y'))
Class = str((Year - 2013) + 6)
return Class
def ReadStoreFile():
i = 0
OpenFile = open(File)
LenFile = len(OpenFile.readlines())
while i < LenFile:
i += 1
badNames.append(linecache.getline(File, i))
def CleanNames():
i = 0
while i < len(badNames):
cleaned = badNames[i].rstrip()
Names.append(cleaned)
i += 1
def NamePrint():
Interns = 'makchessclub.org'
arrayname = []
i = 0
j = 0
m = 0
while m < len(Names):
Name = Names[m]
Name = Name.lower()
InternName = Name[0] + Name[1]
#------------Checking for space and first name--
while i < len(Name):
if Name[i] == ' ':
i = Name.index(' ')
break;
i += 1
#---------------adding last name in an array----
Namelen = len(Name) - (i+1)
while j < Namelen:
arrayname.append(Name[i+1])
j += 1
i += 1
#---------------Final Name Print----------------
Lastname = ''.join(arrayname)
#print arrayname
#Lastname = Lastname.strip(' ')
#print InternName + Lastname + ClassNum() + Interns
file = open('C:\\Users\\username\\Desktop\\emails.txt', 'a')
file.write(InternName + Lastname + ClassNum() + Interns + '\n')
file.close()
m += 1
RemCommas()
ReadStoreFile()
CleanNames()
NamePrint()
print ''
os.system('pause')
The reason the last name doesn't change is because you are not resetting arrayname in your loop. You keep appending names to it, and the program picks the first one. So you should put your arrayname = [] after the while m < len(Names):
I guess this what you are trying to do:
import os
import re
import time
def create_mails(input_path, output_path, year, addr):
with open(input_path, 'r') as data:
mail = re.sub(r'(\w+)\s*,\s*(\w+)\n?', r'\1\g<2>%s%s\n' % (year, addr), data.read())
with open(output_path, 'w') as output:
output.write(mail.lower())
print 'Mail addresses generated and saved to', output_path
Demo:
create_mails(
os.path.join(os.path.expanduser('~'), 'Desktop', 'namess.txt'),
os.path.join(os.path.expanduser('~'), 'Desktop', 'output.txt'),
str(int(time.strftime('%Y')) - 2013 + 6),
'#makchessclub.org'
)
If namess.txt is something like this:
First, Last
John,Doe
Spam, Ham
Cabbage, egg
Then output.txt is going to be like this:
firstlast6#makchessclub.org
johndoe6#makchessclub.org
spamham6#makchessclub.org
cabbageegg6#makchessclub.org