Write a List Into Excel File with Python - python

I have a list to be exported to an Excel file keeping the appropriate format, I resorted to a library named xlsxwriter,
here is an example :
xlsxwriter
and here is my list :
{'FirstName': u'Forence','LastName': u'Bidorst', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
{'FirstName': u'Oliver','LastName': u'Bidorst', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
{'FirstName': u'Mathew','LastName': u'Stark', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
{'FirstName': u'Sphed','LastName': u'liomst', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
I modified the code to browse a list and insert it into the file,
def export_result_XL():
list=get_list()
...
# Write some data headers.
worksheet.write('A1', 'First Name', bold)
worksheet.write('B1', 'Last Name', bold)
worksheet.write('C1', 'Salary', bold)
worksheet.write('D1', 'Birth Date', bold)
# Some data we want to write to the worksheet.
for entry in list:
x = str(entry['FirstName'])
y = str(entry['LastName'])
z = str(entry['Salary'])
e = str(entry['BirthDate'])
v = BirthDate[:10] # because the date format is like yyyy-mm-dd 00:00:00
expenses = (
[x,y ,z ,v]
)
# Start from the first cell below the headers.
row = 1
col = 0
for item ,str1,str2,str3,date_str in (expenses):
# Convert the date string into a datetime object.
date = datetime.strptime(date_str, "%Y-%m-%d")
worksheet.write_string (row, col, str1 )
worksheet.write_string (row, col + 1, str2 )
worksheet.write_string(row, col + 2, str3 )
worksheet.write_datetime(row, col + 3, date, date_format )
row += 1
# Write a total using a formula.
#worksheet.write(row, 0, 'Total', bold)
#worksheet.write(row, 2, '=SUM(C2:C5)', money_format)
workbook.close()
return ''
I had two problems here :
1 -
for item, date_str in (frais)
ValueError: too many values ​​to unpack
2-
if I avoid to convert to date format the file will be genreated but columns and rows are flipped
Is there any Idea how to do it , I hope I was clear in the description

Finally I found a Solution :
row = 1
col = 0
for entry in list:
print entry
strdate=str(entry['BirthDate'])
formatdate=strdate[:10]
date = datetime.strptime(str(formatdate), "%Y-%m-%d")
worksheet.write_string (row, col, entry['FirstName'] )
worksheet.write_string (row, col+1, entry['LastName'] )
worksheet.write_number (row, col+6, entry['Salary'],number_format )
worksheet.write_datetime(row, col+10, date, date_format )
row += 1
workbook.close()

Related

How create dataframe from list of dictionary of multi level json

So I have a json file that have multiple level,by using pandas I can read the first level and put it in a dataframe,but the problem is as you can see in the dataframe Column Comments and hastags the second level is inside a column have format of list of dictionary,is there any solution for make the second level dictionary into dataframe. I try to use for loop and json_normalize but it always throw an error. Any suggestion? My code is like this
import pandas as pd
df2 = pd.read_json("data.json")
cid = []
for x in df2["comments"]:
cid.append(x.get('cid'))
data = pd.DataFrame({'cid':cid})
If i use the code it throw an error since I try access list by string not index.
AttributeError: 'list' object has no attribute 'get'
Even I change it into integer it got dictionary inside a column.How to change the dict inside the column or is there another easier way to do this? Dictionary in column
for x in df2["comments"]:
cid.append(x[0])
data = pd.DataFrame({'cid':cid})
for y in data:
print(y.get('cid'))
Example of first row of the data frame
[{'cid': '7000061798167266075', 'createTime': 1629828926, 'text': 'Done 🥰', 'diggCount': 1, 'replyCommentTotal': 0, 'uid': '6529379534374092801', 'uniqueId': 'alikayanti'}, {'cid': '6999869922566783771', 'createTime': 1629784228, 'text': 'haloo min, yg udah ikutan di misi sebelumnya boleh ikutan lagi gaa?', 'diggCount': 1, 'replyCommentTotal': 1, 'uid': '6842932775562642433', 'uniqueId': 'fia_654'}, {'cid': '7000248857603588891', 'createTime': 1629872457, 'text': 'bell bottoms maksudnya apa kak?\napakah benar artinya bel bawah?', 'diggCount': 0, 'replyCommentTotal': 2, 'uid': '6960940768727417857', 'uniqueId': 'keterimadiptn1'}, {'cid': '7000322023545455387', 'createTime': 1629889491, 'text': 'syudah🥰', 'diggCount': 0, 'replyCommentTotal': 0, 'uid': '6806645499672839170', 'uniqueId': 'miftahulhaqqu'}, {'cid': '7001271117180977947', 'createTime': 1630110475, 'text': 'kak, perpanjang dong waktu posting videonya :)', 'diggCount': 1, 'replyCommentTotal': 0, 'uid': '6921267441846830082', 'uniqueId': 'elisabetkst'}]
Maybe this solves your problem:
Defined the following function which unnests any json:
import json
import pandas as pd
def flatten_nested_json_df(df):
df = df.reset_index()
s = (df.applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df.applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
while len(list_columns) > 0 or len(dict_columns) > 0:
new_columns = []
for col in dict_columns:
horiz_exploded = pd.json_normalize(df[col]).add_prefix(f'{col}.')
horiz_exploded.index = df.index
df = pd.concat([df, horiz_exploded], axis=1).drop(columns=[col])
new_columns.extend(horiz_exploded.columns) # inplace
for col in list_columns:
#print(f"exploding: {col}")
df = df.drop(columns=[col]).join(df[col].explode().to_frame())
new_columns.append(col)
s = (df[new_columns].applymap(type) == list).all()
list_columns = s[s].index.tolist()
s = (df[new_columns].applymap(type) == dict).all()
dict_columns = s[s].index.tolist()
return df
and do this:
results = pd.json_normalize(data)
df = pd.DataFrame(results)
outdf = flatten_nested_json_df(df)
which returns:
index cid createTime \
0 0 7000061798167266075 1629828926
1 1 6999869922566783771 1629784228
2 2 7000248857603588891 1629872457
3 3 7000322023545455387 1629889491
4 4 7001271117180977947 1630110475
text diggCount \
0 Done 🥰 1
1 haloo min, yg udah ikutan di misi sebelumnya b... 1
2 bell bottoms maksudnya apa kak?\napakah benar ... 0
3 syudah🥰 0
4 kak, perpanjang dong waktu posting videonya :) 1
replyCommentTotal uid uniqueId
0 0 6529379534374092801 alikayanti
1 1 6842932775562642433 fia_654
2 2 6960940768727417857 keterimadiptn1
3 0 6806645499672839170 miftahulhaqqu
4 0 6921267441846830082 elisabetkst
I found the solution is through multiple for loop after we looping the row of the targeted column,and after that append it as a list
unique_id = []
cid = []
createTime = []
text = []
diggCount = []
replyCommentTotal = []
uid = []
for items in df2["comments"]:
for x in items:
unique_id.append(x["uniqueId"])
cid.append(x["cid"])
createTime.append(x["createTime"])
text.append(x["text"])
diggCount.append(x["diggCount"])
replyCommentTotal.append(x["replyCommentTotal"])
uid.append(x["uid"])
data = pd.DataFrame({'unique_id':unique_id,
'cid':cid,
'createTime':createTime,
'text':text,
'diggCount':diggCount,
'replyCommentTotal':replyCommentTotal,
'uid':uid})

Split dataframe based on pandas groupby and generate multiple PDFs

There's a table containing a list of 3 employees and 3-4 courses they are supposed to take respectively. I want to create individual PDFs for every employee in this table. First PDF will have list of 3 courses to be taken by Emp1, 2nd PDF will have list of 3 courses to be taken by Emp2 and so on.
The below code is creating just 1 PDF and contains list of all courses for all employees together.
My idea is to initially split/ group the data based on EmpNo and then create individual PDF and to do this I need to create a For Loop for iterating. However, I am unable to figure this...
DataFrame Code
pip install fpdf #To generate PDF
import pandas as pd
data = {'EmpNo': ['123','123','123','456','456', '456','456','789','789','789'],
'First Name': ['John', 'John', 'John', 'Jane', 'Jane', 'Jane', 'Jane', 'Danny', 'Danny', 'Danny'],
'Last Name': ['Doe', 'Doe' ,'Doe', 'Doe' ,'Doe', 'Doe', 'Doe', 'Roberts', 'Roberts', 'Roberts'],
'Activity Code': ['HR-CONF-1', 'HR-Field-NH-ONB','COEATT-2021','HR-HBK-CA-1','HR-WD-EMP','HR-LIST-1','HS-Guide-3','HR-WD-EMP','HR-LIST-1','HS-Guide-3'],
'RegistrationDate': ['11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021','11/22/2021', '11/22/2021', '11/22/2021','11/22/2021']}
df = pd.DataFrame(data = data, columns = ['EmpNo','First Name', 'Last Name', 'Activity Code', 'RegistrationDate'])
employees = data['EmpNo']
employees = data.drop_duplicates(subset=['EmpNo'])
print(df)
Input looks like this,
PDF Generation Code
from fpdf import FPDF
class PDF(FPDF):
def header(self):
# Arial bold 15
self.set_font('Helvetica', 'B', 15)
# Move to the right
self.cell(80)
# Title
self.cell(42, 2, 'Plan', 0, 0, 'C')
# Line break
self.ln(20)
# Page footer
def footer(self):
# Position at 1.5 cm from bottom
self.set_y(-15)
# Arial italic 8
self.set_font('Helvetica', 'I', 8)
# Page number
self.cell(0, 10, 'Page ' + str(self.page_no()) + '/{nb}', 0, 0, 'C')
# Footer image First is horizontal, second is vertical, third is size
for EmpNo in employees['EmpNo']:
print (EmpNo)
# Instantiation of inherited class
pdf = PDF()
pdf.alias_nb_pages()
pdf.add_page()
pdf.set_font('Helvetica', '', 11)
pdf.cell(80, 6, 'Employee ID: ' + str(data.loc[0]['EmpNo']), 0, 1, 'L')
pdf.ln(2.5)
pdf.multi_cell(160, 5, 'Dear ' + str(data.loc[0]['First Name']) + ' ' + str(data.loc[0]['Last Name']) + ', Please find below your Plan.', 0, 1, 'L')
pdf.cell(80, 6, '', 0, 1, 'C')
pdf.set_font('Helvetica', 'B', 13)
pdf.cell(80, 6, 'Name', 0, 0, 'L')
pdf.cell(40, 6, 'Date', 0, 0, 'L')
pdf.cell(40, 6, 'Link', 0, 1, 'L')
pdf.cell(80, 6, '', 0, 1, 'C')
pdf.set_font('Helvetica', '', 8)
for i in range (len(data)):
pdf.set_font('Helvetica', '', 8)
pdf.cell(80, 6, data.loc[0+i]['Activity Code'], 0, 0, 'L')
#pdf.cell(40, 6, data.loc[0+i]['Activity Link'], 0, 1, 'L')
pdf.cell(40, 6, data.loc[0+i]['RegistrationDate'], 0, 0, 'L')
pdf.set_font('Helvetica', 'U', 8)
pdf.cell(40, 6, 'Click Here', 0, 1, 'L', link = 'www.google.com')
pdf.set_font('Helvetica', 'B', 10)
pdf.cell(80, 6, '', 0, 1, 'C')
pdf.cell(80, 6, 'IF YOU REQUIRE ANY HELP, PLEASE CONTACT US', 0, 0, 'L')
pdf.output(str(data.loc[0]['First Name']) + ' ' + str(data.loc[0]['Last Name'])+ '.pdf', 'F')
Here's a snap of PDF generated.
I can split the data using below code, but I am stuck at how to call out individual splits and then further create multiple PDF
splits = list(data.groupby('EmpNo'))
Any help would be greatly appreciated. Thanks.
I would write the groupby like this:
for EmpNo, data in df.groupby("EmpNo"):
For each group, the groupby will return the variable it groups on, and the dataframe which matches that variable.
Next, I would extract the first row of that dataframe. This is to make it easier to get the name and similar attributes.
first_row = data.iloc[0]
(What's the difference between iloc and loc?)
Since we have the employee ID already, we can skip looking it up in the dataframe. For other attributes, we can look it up like first_row['First Name'].
pdf.cell(80, 6, 'Employee ID: ' + str(EmpNo), 0, 1, 'L')
# ...
pdf.multi_cell(160, 5, 'Dear ' + str(first_row['First Name']) + ' ' + str(first_row['Last Name']) + ', Please find below your Plan.', 0, 1, 'L')
Next, in this loop which loops over the subset, I would use .iterrows() to do the loop instead of using range() and .loc. This is easier and won't break if the index of your dataframe doesn't start with zero. (After grouping, the second group's index won't start with zero anymore.)
Here is the final source code after the changes:
import pandas as pd
data = {'EmpNo': ['123','123','123','456','456', '456','456','789','789','789'],
'First Name': ['John', 'John', 'John', 'Jane', 'Jane', 'Jane', 'Jane', 'Danny', 'Danny', 'Danny'],
'Last Name': ['Doe', 'Doe' ,'Doe', 'Doe' ,'Doe', 'Doe', 'Doe', 'Roberts', 'Roberts', 'Roberts'],
'Activity Code': ['HR-CONF-1', 'HR-Field-NH-ONB','COEATT-2021','HR-HBK-CA-1','HR-WD-EMP','HR-LIST-1','HS-Guide-3','HR-WD-EMP','HR-LIST-1','HS-Guide-3'],
'RegistrationDate': ['11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021', '11/22/2021','11/22/2021', '11/22/2021', '11/22/2021','11/22/2021']}
df = pd.DataFrame(data = data, columns = ['EmpNo','First Name', 'Last Name', 'Activity Code', 'RegistrationDate'])
from fpdf import FPDF
class PDF(FPDF):
def header(self):
# Arial bold 15
self.set_font('Helvetica', 'B', 15)
# Move to the right
self.cell(80)
# Title
self.cell(42, 2, 'Plan', 0, 0, 'C')
# Line break
self.ln(20)
# Page footer
def footer(self):
# Position at 1.5 cm from bottom
self.set_y(-15)
# Arial italic 8
self.set_font('Helvetica', 'I', 8)
# Page number
self.cell(0, 10, 'Page ' + str(self.page_no()) + '/{nb}', 0, 0, 'C')
# Footer image First is horizontal, second is vertical, third is size
for EmpNo, data in df.groupby("EmpNo"):
# Get first row of grouped dataframe
first_row = data.iloc[0]
# Instantiation of inherited class
pdf = PDF()
pdf.alias_nb_pages()
pdf.add_page()
pdf.set_font('Helvetica', '', 11)
pdf.cell(80, 6, 'Employee ID: ' + str(EmpNo), 0, 1, 'L')
pdf.ln(2.5)
pdf.multi_cell(160, 5, 'Dear ' + str(first_row['First Name']) + ' ' + str(first_row['Last Name']) + ', Please find below your Plan.', 0, 1, 'L')
pdf.cell(80, 6, '', 0, 1, 'C')
pdf.set_font('Helvetica', 'B', 13)
pdf.cell(80, 6, 'Name', 0, 0, 'L')
pdf.cell(40, 6, 'Date', 0, 0, 'L')
pdf.cell(40, 6, 'Link', 0, 1, 'L')
pdf.cell(80, 6, '', 0, 1, 'C')
pdf.set_font('Helvetica', '', 8)
for _, row in data.iterrows():
pdf.set_font('Helvetica', '', 8)
pdf.cell(80, 6, row['Activity Code'], 0, 0, 'L')
#pdf.cell(40, 6, row['Activity Link'], 0, 1, 'L')
pdf.cell(40, 6, row['RegistrationDate'], 0, 0, 'L')
pdf.set_font('Helvetica', 'U', 8)
pdf.cell(40, 6, 'Click Here', 0, 1, 'L', link = 'www.google.com')
pdf.set_font('Helvetica', 'B', 10)
pdf.cell(80, 6, '', 0, 1, 'C')
pdf.cell(80, 6, 'IF YOU REQUIRE ANY HELP, PLEASE CONTACT US', 0, 0, 'L')
pdf.output(str(first_row['First Name']) + ' ' + str(first_row['Last Name'])+ '.pdf', 'F')
Tested, and it works.

XlsxWriter set format on formula cells

After working on this for far to long.
How do I set the format of a cell that I either have written a formula to or will be writing a formula to?
Every other write_(), except write_formula(), includes a format parameter.
for example:
ws.write_number(1,1,quantity, fmt)
ws.write_number(1,2,price, fmt)
# ws.write_formula("C1","=A1*B1",fmt) <-- doesn't exists
ws.write_formula("C1","=A1*B1")
This works:
extendedprice = (quantity*price)
ws.write_formula("C1", "=A1*B1", extendedprice, fmt)
I even figured out I can:
ws.write_number(1,1,quantity, fmt)
if (<price has an error>):
ws.write_number(1,2,"n/a",fmt)
ws.write_formula("C1", "=A1*B1", "n/a", fmt)
else:
ws.write_number(1,2,price,fmt)
ws.write_formula("C1", "=A1*B1", (quantity*price), fmt)
A format can be applied to a formula with XlsxWriter in the same way as any other data type:
import xlsxwriter
workbook = xlsxwriter.Workbook('test.xlsx')
worksheet = workbook.add_worksheet()
my_format = workbook.add_format({'bold': True, 'color': 'red'})
worksheet.write(0, 0, 'Hello', my_format)
worksheet.write(1, 0, 123, my_format)
worksheet.write(2, 0, '=1+1', my_format)
worksheet.write('A4', 'World', my_format)
worksheet.write('A5', 456, my_format)
worksheet.write('A6', '=2+2', my_format)
workbook.close()
Output:

Odd Behaviour of Loop in Python

I am writing a script to report statistics from a text file in Markdown. The file contains book titles and dates. Each date belongs to the titles that follow, until a new date appears. Here is a sample:
#### 8/23/05
Defining the World (Hitchings)
#### 8/26/05
Lost Japan
#### 9/5/05
The Kite Runner
*The Dark Valley (Brendon)*
#### 9/9/05
Active Liberty
I iterate over lines in the file with a for loop and examine each line to see if it's a date. If it's a date, I set a variable this_date. If it's a title, I make it into a dict with the current value of this_date.
There are two exceptions: the file starts with titles, not a date, so I set an initial value for this_date before the for loop. And halfway through the file there is a region where dates were lost, and I set a specific date for those titles.
But in the resulting list of dicts, all the titles are given that date until the lost-data region starts. After that point, the rest of the titles are given the date that appears last in the file. What is most confusing: when I print the contents of this_date right before appending the new dict, it contains the correct value on every loop.
I expect this_date to be visible at all levels of the loop. I know I need to break this up into functions, and passing results explicitly between functions will probably fix the issue, but I'd like to know why this approach didn't work. Thank you very much.
result = []
# regex patterns
ddp = re.compile('\d+') # extract digits
mp = re.compile('^#+\s*\d+') # captures hashes and spaces
dp = re.compile('/\d+/') # captures slashes
yp = re.compile('\d+$')
sp = re.compile('^\*')
# initialize
this_date = {
'month': 4,
'day': 30,
'year': 2005
}
# print('this_date initialized')
for line in text:
if line == '':
pass
else:
if '#' in line: # markdown header format - line is a new date
if 'Reconstructing lost data' in line: # handle exception
# titles after this line are given 12/31/14 (the last date in the file) instead of 8/31/10
# all prior dates are overwritten with 8/31/10
# but the intent is that titles after this line appears have date 8/31/10, until the next date
this_date = {
'month': 8,
'day': 31,
'year': 2010
}
# print('set this_date to handle exception')
else: # get the date from the header
month = ddp.search( mp.search(line).group() ) # digits only
day = ddp.search( dp.search(line).group() ) # digits only
year = yp.search(line)
if month and day and year:
# print('setting this_date within header parse')
this_date['month'] = int(month.group())
this_date['day'] = int(day.group())
this_date['year'] = ( int(year.group()) + 2000 )
else:
pass
else: # line is a title
x = {
'date': this_date,
'read': False
}
if sp.match(line): # starts with asterisk - has been read
x['read'] = True
x['title'] = line[1:-3] # trim trailing asterisk and spaces
else:
x['title'] = line
# this_date is correct when printed here
# print('this_date is ' + str(this_date['month']) + '/' + str(this_date['day']) + '/' + str(this_date['year']) )
result.append(x)
# x has correct date when printed here
# print(x)
# print("Done; found %d titles.") % len(result)
# elements of result have wrong dates (either 8/31/10 or 12/31/14, no other values) when printed here
# print( result[0::20])
You create the this_date dictionary just once. You then reuse that dictionary each loop iteration. You are only adding references to that dictionary to your result list; it is just the one dictionary referenced over and over again.
Store a new copy of the dictionary each loop iteration:
x = {
'date': this_date.copy(),
'read': False
}
Your code could do with some simplification; I'd use datetime.date() objects here instead as they model dates properly. No regular expressions are required:
from datetime import datetime
current_date = None
results = []
for line in text:
line = line.strip()
if not line:
continue
if line.startswith('#'):
current_date = datetime.strptime(line.strip('# '), '%m/%d/%y').date()
continue
entry = {'date': current_date, 'read': False}
if line.startswith('*') and line.endswith('*'):
# previously read
line = line.strip('*')
entry['read'] = True
entry['title'] = line
results.append(entry)
Because datetime.date() objects are immutable and we create a new date object each time we encounter a header line, you can safely re-use the last-read date.
Demo:
>>> from datetime import datetime
>>> from pprint import pprint
>>> text = '''\
... #### 8/23/05
... Defining the World (Hitchings)
... #### 8/26/05
... Lost Japan
... #### 9/5/05
... The Kite Runner
... *The Dark Valley (Brendon)*
... #### 9/9/05
... Active Liberty
... '''.splitlines(True)
>>> current_date = None
>>> results = []
>>> for line in text:
... line = line.strip()
... if not line:
... continue
... if line.startswith('#'):
... current_date = datetime.strptime(line.strip('# '), '%m/%d/%y').date()
... continue
... entry = {'date': current_date, 'read': False}
... if line.startswith('*') and line.endswith('*'):
... # previously read
... line = line.strip('*')
... entry['read'] = True
... entry['title'] = line
... results.append(entry)
...
>>> pprint(results)
[{'date': datetime.date(2005, 8, 23),
'read': False,
'title': 'Defining the World (Hitchings)'},
{'date': datetime.date(2005, 8, 26), 'read': False, 'title': 'Lost Japan'},
{'date': datetime.date(2005, 9, 5),
'read': False,
'title': 'The Kite Runner'},
{'date': datetime.date(2005, 9, 5),
'read': True,
'title': 'The Dark Valley (Brendon)'},
{'date': datetime.date(2005, 9, 9), 'read': False, 'title': 'Active Liberty'}]

Dates to categories

I have an Excel spreadsheet I'm preparing to migrate to Access and the date column has entries in multiple formats such as: 1963 to 1969, Aug. 1968 to Sept. 1968, 1972, Mar-73, 24-Jul, Oct. 2, 1980, Aug 29, 1980, July 1946, etc. and 'undated'. I'm pulling the column that will be the key (map number) and date column into a csv and writing back to a csv.
I can strip out years that are 4 digit, but not ranges. And I'm stumped how to extract days and 2 digit years short of re-formatting by hand. My code isn't very elegant and probably not best practice:
import csv, xlwt, re
# create new Excel document and add sheet
# from tempfile import TemporaryFile
from xlwt import Workbook
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')
# populate first row with header
sheet1.write(0,0,"Year")
sheet1.write(0,1,"Map")
sheet1.write(0,2,"As Entered")
# count variable for populating sheet
rowCount=0
# open csv file and read
with open('C:\dateTestMSDOs.csv', 'rb') as f:
reader=csv.reader(f)
for row in reader:
map = row[0] # first row is map number
dateRaw = row[1] # second row is raw date as entered
# write undated and blank entries
if dateRaw == 'undated':
yearStr = '0000'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
if dateRaw == '':
yearStr = 'NoEntry'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
# search and write instances of four consecutive digits
try:
year = re.search(r'\d\d\d\d', dateRaw)
yearStr= year.group()
#print yearStr, map, dateRaw
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
# if none exist flag for cleaning spreadsheet and print
except:
#print 'Nope', map, dateRaw
rowCount +=1
yearStr='Format'
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
yearStr=''
dateRaw=''
book.save('D:\dateProperty.xls')
print "Done!"
I would like to write day and month to an additional column as well as pull the second 4 digit date of range entries.
You can try using dateutil for this. I think you'd still need to deal with some of the more difficult formats in a different way though. See a sample implementation below:
Code:
import dateutil.parser as dateparser
date_list = ['1963 to 1969',
'Aug. 1968 to Sept. 1968',
'Mar-73',
'24-Jul',
'Oct. 2 1980',
'Aug 29, 1980',
'July 1946',
'undated']
for d in date_list:
if 'to' in d:
a, b = d.split('to')
# Get the higher number. Use min to get lower of two.
print max(dateparser.parse(a.strip()).year, dateparser.parse(b.strip()).year)
elif d == 'undated':
print '0000'
else:
yr = dateparser.parse(d).year
print yr
Result:
1969
1968
1973
2014
1980
1980
1946
0000
[Finished in 0.4s]
Only glaring issue I can see is that 24-Jul returns a date of 2014 because the parser assumes the current day, month, or year in place of missing component, ie. Mar-73 will become 1973-03-20 if today is the 20th of the month, etc.
Not entirely sure if this is what you were going for or not but I just used a "simple" regex search and then traversed through the sets of groups that matched, applying the given function defined. If a match is found then the function that is called (found in the regex_groups variable) should return a dictionary with the following keys: start_day, start_month, start_year, end_day, end_month, end_year
Then you can do whatever you'd like with those values. Definitely not the cleanest solution but it works, as far as I can tell.
#!/usr/local/bin/python2.7
import re
# Crazy regex
regex_pattern = '(?:(\d{4}) to (\d{4}))|(?:(\w+)\. (\d{4}) to (\w+)\. (\d{4}))|(?:(\w+)-(\d{2}))|(?:(\d{2})-(\w+))|(?:(\w+)\. (\d+), (\d{4}))|(?:(\w+) (\d+), (\d{4}))|(?:(\w+) (\d{4}))|(?:(\d{4}))'
date_strings = [
'1963 to 1969',
'Aug. 1968 to Sept. 1968',
'1972',
'Mar-73',
'24-Jul',
'Oct. 2, 1980',
'Aug 29, 1980',
'July 1946',
]
# Here you set the group matching functions that will be called for a matching group
regex_groups = {
(1,2): lambda group_matches: {
'start_day': '', 'start_month': '', 'start_year': group_matches[0],
'end_day': '', 'end_month': '', 'end_year': group_matches[1]
},
(3,4,5,6): lambda group_matches: {
'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
'end_day': '', 'end_month': group_matches[2], 'end_year': group_matches[3]
},
(7,8): lambda group_matches: {
'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
'end_day': '', 'end_month': '', 'end_year': ''
},
(9,10): lambda group_matches: {
'start_day': group_matches[1], 'start_month': '', 'start_year': group_matches[0],
'end_day': '', 'end_month': '', 'end_year': ''
},
(11,12,13): lambda group_matches: {
'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
'end_day': '', 'end_month': '', 'end_year': ''
},
(14,15,16): lambda group_matches: {
'start_day': group_matches[1], 'start_month': group_matches[0], 'start_year': group_matches[2],
'end_day': '', 'end_month': '', 'end_year': ''
},
(17,18): lambda group_matches: {
'start_day': '', 'start_month': group_matches[0], 'start_year': group_matches[1],
'end_day': '', 'end_month': '', 'end_year': ''
},
(19,): lambda group_matches: {
'start_day': '', 'start_month': '', 'start_year': group_matches[0],
'end_day': '', 'end_month': '', 'end_year': ''
},
}
for ds in date_strings:
matches = re.search(regex_pattern, ds)
start_month = ''
start_year = ''
end_month = ''
end_year = ''
for regex_group, group_func in regex_groups.items():
group_matches = [matches.group(sub_group_num) for sub_group_num in regex_group]
if all(group_matches):
match_data = group_func(group_matches)
print
print 'Matched:', ds
print '%s to %s' % ('-'.join([match_data['start_day'], match_data['start_month'], match_data['start_year']]), '-'.join([match_data['end_day'], match_data['end_month'], match_data['end_year']]))
# match_data is a dictionary with keys:
# * start_day
# * start_month
# * start_year
# * end_day
# * end_month
# * end_year
# If a group doesn't contain one of those items, then it is set to a blank string
Outputs:
Matched: 1963 to 1969
--1963 to --1969
Matched: Aug. 1968 to Sept. 1968
-Aug-1968 to -Sept-1968
Matched: 1972
--1972 to --
Matched: Mar-73
-Mar-73 to --
Matched: 24-Jul
Jul--24 to --
Matched: Oct. 2, 1980
2-Oct-1980 to --
Matched: Aug 29, 1980
29-Aug-1980 to --
Matched: July 1946
-July-1946 to --
You could define all the possible cases of dates using regex, something like:
import re
s = ['1963 to 1969', 'Aug. 1968 to Sept. 1968',
'1972', 'Mar-73', '03-Jun', '24-Jul', 'Oct. 2, 1980', 'Oct. 26, 1980',
'Aug 29 1980', 'July 1946']
def get_year(date):
mm = re.findall("\d{4}", date)
if mm:
return mm
mm = re.search("\w+-(\d{2})", date)
if mm:
return [mm.group(1)]
def get_month(date):
mm = re.findall("[A-Z][a-z]+", date)
if mm:
return mm
def get_day(date):
d_expr = ["(\d|\d{2})\-[A-Z][a-z]+","[A-Z][a-z]+[\. ]+(\d|\d{2}),"]
for expr in d_expr:
mm = re.search(expr, date)
if mm:
return [mm.group(1)]
d = {}
m = {}
y = {}
for idx, date in enumerate(s):
d[idx] = get_day(date)
m[idx] = get_month(date)
y[idx] = get_year(date)
print "Year Dict: ", y
print "Month Dict: ", m
print "Day Dict: ", d
As result you get dictionaries of days, month, and years. They could be used to populate the rows.
Output:
Year Dict: {0: ['1963', '1969'], 1: ['1968', '1968'], 2: ['1972'], 3: ['73'], 4: None, 5: None, 6: ['1980'], 7: ['1980'], 8: ['1980'], 9: ['1946']}
Month Dict: {0: None, 1: ['Aug', 'Sept'], 2: None, 3: ['Mar'], 4: ['Jun'], 5: ['Jul'], 6: ['Oct'], 7: ['Oct'], 8: ['Aug'], 9: ['July']}
Day Dict: {0: None, 1: None, 2: None, 3: None, 4: ['03'], 5: ['24'], 6: ['2'], 7: ['26'], 8: None, 9: None}
Thank you for the innovative suggestions. After consideration we decided to remove day and month from what would be searchable in our database, since only a relatively small amount of our data had that level of detail. Here is the code I use to extract and generate the data I needed from a long and messy list.
import csv, xlwt, re
# create new Excel document and add sheet
from xlwt import Workbook
book = Workbook()
sheet1 = book.add_sheet('Sheet 1')
# populate first row with header
sheet1.write(0,0,"MapYear_(Parsed)")
sheet1.write(0,1,"Map_Number")
sheet1.write(0,2,"As_Entered")
# count variable for populating sheet
rowCount=0
# open csv file and read
yearStr = ''
with open('C:\mapsDateFix.csv', 'rb') as f:
reader=csv.reader(f)
for row in reader:
map = row[0] # first row is map number
dateRaw = row[1] # second row is raw date as entered
# write undated and blank entries
if dateRaw == 'undated':
yearStr = 'undated'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
#yearStr=''
if yearStr != 'undated':
if dateRaw == '':
yearStr = 'NoEntry'
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
#yearStr=''
# search and write instances of four consecutive digits
if yearStr != dateRaw:
try:
year = re.search(r'\d\d\d\d', dateRaw)
yearStr= year.group()
#print yearStr, map, dateRaw
rowCount +=1
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
# if none exist flag for cleaning spreadsheet and print
except:
#print 'Nope', map, dateRaw
rowCount +=1
yearStr='Format'
sheet1.write(rowCount, 0, yearStr)
sheet1.write(rowCount, 1, map)
sheet1.write(rowCount, 2, dateRaw)
#print rowCount, yearStr, map, dateRaw, '\n'
yearStr=''
yearStr=''
dateRaw=''
book.save('D:\dateProperty.xls')
print "Done!"

Categories

Resources