I have an Excel spreadsheet I'm preparing to migrate to Access and the date column has entries in multiple formats such as: 1963 to 1969, Aug. 1968 to Sept. 1968, 1972, Mar-73, 24-Jul, Oct. 2, 1980, Aug 29, 1980, July 1946, etc. and 'undated'. I'm pulling the column that will be the key (map number) and date column into a csv and writing back to a csv.
I can strip out years that are 4 digit, but not ranges. And I'm stumped how to extract days and 2 digit years short of re-formatting by hand. My code isn't very elegant and probably not best practice:
import csv, xlwt, re
# create new Excel document and add sheet
from xlwt import Workbook

book = Workbook()
sheet1 = book.add_sheet('Sheet 1')

# populate first row with header
sheet1.write(0, 0, "Year")
sheet1.write(0, 1, "Map")
sheet1.write(0, 2, "As Entered")

# row counter for populating the sheet (row 0 holds the header)
rowCount = 0

# Raw strings keep the backslashes in Windows paths from being read as
# escape sequences. newline='' is the csv-module requirement on Python 3.
with open(r'C:\dateTestMSDOs.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        map_number = row[0]  # first column is the map number (renamed: 'map' shadows the builtin)
        dateRaw = row[1]     # second column is the raw date as entered
        # Classify the raw date exactly once per row. The original wrote
        # 'undated' and blank rows twice: once in their own branch and
        # again when the 4-digit search raised AttributeError and the
        # bare except emitted a second 'Format' row.
        if dateRaw == 'undated':
            yearStr = '0000'
        elif dateRaw == '':
            yearStr = 'NoEntry'
        else:
            year = re.search(r'\d\d\d\d', dateRaw)
            if year:
                yearStr = year.group()
            else:
                # no 4-digit year found: flag the row for manual cleanup
                yearStr = 'Format'
        rowCount += 1
        sheet1.write(rowCount, 0, yearStr)
        sheet1.write(rowCount, 1, map_number)
        sheet1.write(rowCount, 2, dateRaw)

book.save(r'D:\dateProperty.xls')
print("Done!")
I would like to write day and month to an additional column as well as pull the second 4 digit date of range entries.
You can try using dateutil for this. I think you'd still need to deal with some of the more difficult formats in a different way though. See a sample implementation below:
Code:
import dateutil.parser as dateparser
date_list = ['1963 to 1969',
'Aug. 1968 to Sept. 1968',
'Mar-73',
'24-Jul',
'Oct. 2 1980',
'Aug 29, 1980',
'July 1946',
'undated']
for d in date_list:
if 'to' in d:
a, b = d.split('to')
# Get the higher number. Use min to get lower of two.
print max(dateparser.parse(a.strip()).year, dateparser.parse(b.strip()).year)
elif d == 'undated':
print '0000'
else:
yr = dateparser.parse(d).year
print yr
Result:
1969
1968
1973
2014
1980
1980
1946
0000
[Finished in 0.4s]
Only glaring issue I can see is that 24-Jul returns a date of 2014 because the parser assumes the current day, month, or year in place of missing component, ie. Mar-73 will become 1973-03-20 if today is the 20th of the month, etc.
Not entirely sure if this is what you were going for or not but I just used a "simple" regex search and then traversed through the sets of groups that matched, applying the given function defined. If a match is found then the function that is called (found in the regex_groups variable) should return a dictionary with the following keys: start_day, start_month, start_year, end_day, end_month, end_year
Then you can do whatever you'd like with those values. Definitely not the cleanest solution but it works, as far as I can tell.
#!/usr/bin/env python3
import re

# One alternation per known date layout; the group numbers identify which
# layout matched (numbering kept identical to the original pattern).
regex_pattern = (
    r'(?:(\d{4}) to (\d{4}))'                    # 1-2:   "1963 to 1969"
    r'|(?:(\w+)\. (\d{4}) to (\w+)\. (\d{4}))'   # 3-6:   "Aug. 1968 to Sept. 1968"
    r'|(?:(\w+)-(\d{2}))'                        # 7-8:   "Mar-73"
    r'|(?:(\d{2})-(\w+))'                        # 9-10:  "24-Jul"
    r'|(?:(\w+)\. (\d+), (\d{4}))'               # 11-13: "Oct. 2, 1980"
    r'|(?:(\w+) (\d+), (\d{4}))'                 # 14-16: "Aug 29, 1980"
    r'|(?:(\w+) (\d{4}))'                        # 17-18: "July 1946"
    r'|(?:(\d{4}))'                              # 19:    "1972"
)

date_strings = [
    '1963 to 1969',
    'Aug. 1968 to Sept. 1968',
    '1972',
    'Mar-73',
    '24-Jul',
    'Oct. 2, 1980',
    'Aug 29, 1980',
    'July 1946',
]

# Map each tuple of group numbers to a builder that turns the captured
# strings into a dict with the keys start_day, start_month, start_year,
# end_day, end_month, end_year (parts a layout lacks are '').
regex_groups = {
    (1, 2): lambda g: {  # "YYYY to YYYY"
        'start_day': '', 'start_month': '', 'start_year': g[0],
        'end_day': '', 'end_month': '', 'end_year': g[1],
    },
    (3, 4, 5, 6): lambda g: {  # "Mon. YYYY to Mon. YYYY"
        'start_day': '', 'start_month': g[0], 'start_year': g[1],
        'end_day': '', 'end_month': g[2], 'end_year': g[3],
    },
    (7, 8): lambda g: {  # "Mon-YY"
        'start_day': '', 'start_month': g[0], 'start_year': g[1],
        'end_day': '', 'end_month': '', 'end_year': '',
    },
    (9, 10): lambda g: {  # "DD-Mon": g = (day, month); no year present.
        # Fixed: the original put the month in start_day and the day in
        # start_year, which printed "Jul--24" for '24-Jul'.
        'start_day': g[0], 'start_month': g[1], 'start_year': '',
        'end_day': '', 'end_month': '', 'end_year': '',
    },
    (11, 12, 13): lambda g: {  # "Mon. D, YYYY"
        'start_day': g[1], 'start_month': g[0], 'start_year': g[2],
        'end_day': '', 'end_month': '', 'end_year': '',
    },
    (14, 15, 16): lambda g: {  # "Mon D, YYYY"
        'start_day': g[1], 'start_month': g[0], 'start_year': g[2],
        'end_day': '', 'end_month': '', 'end_year': '',
    },
    (17, 18): lambda g: {  # "Mon YYYY"
        'start_day': '', 'start_month': g[0], 'start_year': g[1],
        'end_day': '', 'end_month': '', 'end_year': '',
    },
    (19,): lambda g: {  # bare "YYYY"
        'start_day': '', 'start_month': '', 'start_year': g[0],
        'end_day': '', 'end_month': '', 'end_year': '',
    },
}

for ds in date_strings:
    match = re.search(regex_pattern, ds)
    # Exactly one alternative fills its groups; find it and report.
    for group_nums, build in regex_groups.items():
        captured = [match.group(n) for n in group_nums]
        if all(captured):
            match_data = build(captured)
            print()
            print('Matched:', ds)
            start = '-'.join([match_data['start_day'],
                              match_data['start_month'],
                              match_data['start_year']])
            end = '-'.join([match_data['end_day'],
                            match_data['end_month'],
                            match_data['end_year']])
            print('%s to %s' % (start, end))
Outputs:
Matched: 1963 to 1969
--1963 to --1969
Matched: Aug. 1968 to Sept. 1968
-Aug-1968 to -Sept-1968
Matched: 1972
--1972 to --
Matched: Mar-73
-Mar-73 to --
Matched: 24-Jul
Jul--24 to --
Matched: Oct. 2, 1980
2-Oct-1980 to --
Matched: Aug 29, 1980
29-Aug-1980 to --
Matched: July 1946
-July-1946 to --
You could define all the possible cases of dates using regex, something like:
import re

s = ['1963 to 1969', 'Aug. 1968 to Sept. 1968',
     '1972', 'Mar-73', '03-Jun', '24-Jul', 'Oct. 2, 1980', 'Oct. 26, 1980',
     'Aug 29 1980', 'July 1946']


def get_year(date):
    """Return all 4-digit years in *date*, or the 2-digit year from a
    'Mon-YY' form; None when nothing year-like is found."""
    years = re.findall(r"\d{4}", date)
    if years:
        return years
    mm = re.search(r"\w+-(\d{2})", date)
    if mm:
        return [mm.group(1)]


def get_month(date):
    """Return all capitalised month-like words in *date* (e.g. 'Aug'),
    or None when there are none."""
    months = re.findall(r"[A-Z][a-z]+", date)
    if months:
        return months


def get_day(date):
    """Return the day number from 'DD-Mon' or 'Mon. D,' style entries,
    or None when no day is present."""
    day_patterns = [r"(\d|\d{2})\-[A-Z][a-z]+",
                    r"[A-Z][a-z]+[\. ]+(\d|\d{2}),"]
    for pattern in day_patterns:
        mm = re.search(pattern, date)
        if mm:
            return [mm.group(1)]


d = {}
m = {}
y = {}
for idx, date in enumerate(s):
    d[idx] = get_day(date)
    m[idx] = get_month(date)
    y[idx] = get_year(date)

print("Year Dict: ", y)
print("Month Dict: ", m)
print("Day Dict: ", d)
As result you get dictionaries of days, month, and years. They could be used to populate the rows.
Output:
Year Dict: {0: ['1963', '1969'], 1: ['1968', '1968'], 2: ['1972'], 3: ['73'], 4: None, 5: None, 6: ['1980'], 7: ['1980'], 8: ['1980'], 9: ['1946']}
Month Dict: {0: None, 1: ['Aug', 'Sept'], 2: None, 3: ['Mar'], 4: ['Jun'], 5: ['Jul'], 6: ['Oct'], 7: ['Oct'], 8: ['Aug'], 9: ['July']}
Day Dict: {0: None, 1: None, 2: None, 3: None, 4: ['03'], 5: ['24'], 6: ['2'], 7: ['26'], 8: None, 9: None}
Thank you for the innovative suggestions. After consideration we decided to remove day and month from what would be searchable in our database, since only a relatively small amount of our data had that level of detail. Here is the code I use to extract and generate the data I needed from a long and messy list.
import csv, xlwt, re
# create new Excel document and add sheet
from xlwt import Workbook

book = Workbook()
sheet1 = book.add_sheet('Sheet 1')

# populate first row with header
sheet1.write(0, 0, "MapYear_(Parsed)")
sheet1.write(0, 1, "Map_Number")
sheet1.write(0, 2, "As_Entered")

# row counter for populating the sheet (row 0 holds the header)
rowCount = 0

# Raw strings keep the backslashes in Windows paths literal; newline=''
# is the csv-module requirement on Python 3.
with open(r'C:\mapsDateFix.csv', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        map_number = row[0]  # first column: map number ('map' shadowed the builtin)
        dateRaw = row[1]     # second column: raw date as entered
        # Decide the parsed value exactly once per row. The original's
        # guard chain (yearStr != 'undated', yearStr != dateRaw) still
        # let blank entries fall into the regex branch, writing them a
        # second time as 'Format'.
        if dateRaw == 'undated':
            yearStr = 'undated'
        elif dateRaw == '':
            yearStr = 'NoEntry'
        else:
            year = re.search(r'\d\d\d\d', dateRaw)
            # rows without a 4-digit year are flagged for manual cleanup
            yearStr = year.group() if year else 'Format'
        rowCount += 1
        sheet1.write(rowCount, 0, yearStr)
        sheet1.write(rowCount, 1, map_number)
        sheet1.write(rowCount, 2, dateRaw)

book.save(r'D:\dateProperty.xls')
print("Done!")
Related
This is the sample data in a file. I want to split each line in the file and add it to a dataframe. In some cases they have more than one child, so whenever that happens a new set of columns has to be added (child2 Name and DOB).
(P322) Rashmika Chadda 15/05/1995 – Rashmi C 12/02/2024
(P324) Shiva Bhupati 01/01/1994 – Vinitha B 04/08/2024
(P356) Karthikeyan chandrashekar 22/02/1991 – Kanishka P 10/03/2014
(P366) Kalyani Manoj 23/01/1975 - Vandana M 15/05/1995 - Chandana M 18/11/1998
This is the code I have tried but this splits only by taking "-" into consideration
# Read all raw lines from the input file.
with open("text.txt") as read_file:
    file_contents = read_file.readlines()

# For each line: swap the en-dash separator for a space, then split on
# whitespace, collecting one token list per input line.
content_list = [each_line.replace("–", " ").split() for each_line in file_contents]
print(content_list)
Current output:
[['(P322)', 'Rashmika', 'Chadda', '15/05/1995', 'Rashmi', 'Chadda', 'Teega', '12/02/2024'], ['(P324)', 'Shiva', 'Bhupati', '01/01/1994', 'Vinitha', 'B', 'Sahu', '04/08/2024'], ['(P356)', 'Karthikeyan', 'chandrashekar', '22/02/1991', 'Kanishka', 'P', '10/03/2014'], ['(P366)', 'Kalyani', 'Manoj', '23/01/1975', '-', 'Vandana', 'M', '15/05/1995', '-', 'Chandana', 'M', '18/11/1998']]
Final output should be like below
Code
Parent_Name
DOB
Child1_Name
DOB
Child2_Name
DOB
P322
Rashmika Chadda
15/05/1995
Rashmi C
12/02/2024
P324
Shiva Bhupati
01/01/1994
Vinitha B
04/08/2024
P356
Karthikeyan chandrashekar
22/02/1991
Kanishka P
10/03/2014
P366
Kalyani Manoj
23/01/1975
Vandana M
15/05/1995
Chandana M
18/11/1998
I'm not sure if you want it as a list or something else.
To get lists:
# Build one flat list per input line:
# [code, parent_name, parent_DOB, child1_name, child1_DOB, ...]
result = []
for t in text[:]:
    # remove the \n at the end of each line
    t = t.strip()
    # remove the parentheses we don't want
    t = t.replace("(", "")
    t = t.replace(")", "")
    # split parent from children on the en-dash separator
    t = t.split(" – ")
    # reconstruct: first chunk is the parent, later chunks are children
    for i, person in enumerate(t):
        person = person.split(" ")
        # print(person)
        # remove code (only present on the parent chunk)
        if i==0:
            res = [person.pop(0)]
        # append "<first> <last>" and the DOB for this person
        res.extend([" ".join(person[:2]), person[2]])
    result.append(res)
print(result)
Which would give the below output:
[['P322', 'Rashmika Chadda', '15/05/1995', 'Rashmi C', '12/02/2024'], ['P324', 'Shiva Bhupati', '01/01/1994', 'Vinitha B', '04/08/2024'], ['P356', 'Karthikeyan chandrashekar', '22/02/1991', 'Kanishka P', '10/03/2014'], ['P366', 'Kalyani Manoj', '23/01/1975', 'Vandana M', '15/05/1995', 'Chandana M', '18/11/1998']]
You can organise a bit more the data using dictionnary:
# Same parsing, but organised as {code: {parent fields, children: [...]}}
result = {}
for t in text[:]:
    # remove the \n at the end of each line
    t = t.strip()
    # remove the parentheses we don't want
    t = t.replace("(", "")
    t = t.replace(")", "")
    # split parent from children on the en-dash separator
    t = t.split(" – ")
    for i, person in enumerate(t):
        # split name into words
        person = person.split(" ")
        # remove code (only present on the parent chunk)
        if i==0:
            code = person.pop(0)
        if i==0:
            # first chunk describes the parent
            result[code] = {"parent_name": " ".join(person[:2]), "parent_DOB": person[2], "children": [] }
        else:
            # remaining chunks are children, numbered from 1
            result[code]['children'].append({f"child{i}_name": " ".join(person[:2]), f"child{i}_DOB": person[2]})
print(result)
Which would give this output:
{'P322': {'children': [{'child1_DOB': '12/02/2024',
'child1_name': 'Rashmi C'}],
'parent_DOB': '15/05/1995',
'parent_name': 'Rashmika Chadda'},
'P324': {'children': [{'child1_DOB': '04/08/2024',
'child1_name': 'Vinitha B'}],
'parent_DOB': '01/01/1994',
'parent_name': 'Shiva Bhupati'},
'P356': {'children': [{'child1_DOB': '10/03/2014',
'child1_name': 'Kanishka P'}],
'parent_DOB': '22/02/1991',
'parent_name': 'Karthikeyan chandrashekar'},
'P366': {'children': [{'child1_DOB': '15/05/1995',
'child1_name': 'Vandana M'},
{'child2_DOB': '18/11/1998', 'child2_name': 'Chandana M'}],
'parent_DOB': '23/01/1975',
'parent_name': 'Kalyani Manoj'}}
In the end, to have an actual table, you would need to use pandas but that will require for you to fix the number of children max so that you can pad the empty cells.
I have a hard time formatting my csv files in a way easy to process in a pandas dataframe. I am using this https://figshare.com/articles/UMA_ADL_FALL_Dataset_zip/4214283 dataset of fall data to train a RNN model to detect people falling but the formatting is quite hard to clean up with the python csv reader and even with a more intelligent module clevercsv.
this is the code to itterate over the files and merge them into a Dataframe:
import os

import pandas as pd
import zipfile
import clevercsv as csv

csv_list = []
directory = r"C:\Users\20191678\OneDrive - TU Eindhoven\Engineering Design"
for filename in os.listdir(directory):
    if filename.endswith('.csv'):
        # os.listdir returns bare file names: join them with the
        # directory, otherwise open() looks in the current working
        # directory and raises FileNotFoundError. (The original also
        # used the listed names without importing os at all.)
        path = os.path.join(directory, filename)
        with open(path, "r", newline="") as fp:
            # let clevercsv guess the delimiter/quoting of each file
            dialect = csv.Sniffer().sniff(fp.read(), verbose=True)
            fp.seek(0)
            reader = csv.reader(fp, dialect)
            rows = list(reader)
            csv_list.append(rows)

# one entry per file; each entry is that file's list of rows
df = pd.DataFrame(csv_list)
Would be great if anyone can take the time to solve this and make a structured dataframe! Or come up with another idea of cleaning this up.
The csv file code itself:
% Universidad de Malaga - ETSI de Telecomunicacion (Spain)
% Date: 2017-04-14_23:38:23
% ID: Subject_01_ADL_Aplausing_1
% Name: Subject_01
% Age: 67
% Height(cm): 156
% Weight(Kg): 76
% Gender: F
% Type of Movement: ADL
% Type of Movement: FALSE
% Description of the movement: Aplausing
% Trial: 1
% Number of Sensors: 5
% Used Smartphone: LGE-lge-LG-H815-5.1
% Smartphone's Accelerometer: LGE Accelerometer - Vendor: BOSCH
% --> Version: 1
% --> Min - Max Delay: 5000us - 65535000us
% --> Maximum Range: 16.000000263891405 G
% --> Resolution: 1.2136514986004396E-4 G
% SensorTag's Accelerometer: MPU-9250 MEMS MotionTracking Device - Invensense
% --> Maximum Range: 16 G
% --> Resolution: 0.00024 G
% MAC Address; Sensor_ID; Position; Device Model
%f8:95:c7:f3:ba:82; 0; RIGHTPOCKET; lge-LG-H815-5.1
%C4:BE:84:71:A5:02; 2; WAIST; SensorTag
%C4:BE:84:70:0E:80; 3; WRIST; SensorTag
%B0:B4:48:B8:77:03; 4; ANKLE; SensorTag
%C4:BE:84:70:64:8A; 1; CHEST; SensorTag
% Sensor_Type:
% Accelerometer = 0
% Gyroscope = 1
% Magnetometer = 2
% TimeStamp; Sample No; X-Axis; Y-Axis; Z-Axis; Sensor Type; Sensor ID;
102;1;-0.1387496441602707;0.8868721723556519;0.3310287296772003;0;0
102;2;-0.1381397247314453;0.8865065574645996;0.3323715031147003;0;0
102;3;-0.1348443180322647;0.8895576596260071;0.3311501145362854;0;0
102;4;-0.1402153074741364;0.8866279125213623;0.3337142467498779;0;0
102;5;-0.1391168385744095;0.8862622380256653;0.3345684409141541;0;0
102;6;-0.138628289103508;0.8871164321899414;0.3346897959709168;0;0
102;7;-0.1367969810962677;0.8880935311317444;0.3412821888923645;0;0
102;8;-0.138628289103508;0.8883378505706787;0.3398165106773377;0;0
102;9;-0.1409481465816498;0.8901675939559937;0.3401837050914764;0;0
102;10;-0.1418023407459259;0.8891920447349548;0.3418920934200287;0;0
102;11;-0.1430221647024155;0.8882149457931519;0.3420134484767914;0;0
103;12;-0.143510714173317;0.8880935311317444;0.3422577381134033;0;0
103;13;-0.1439992785453796;0.8838210105895996;0.3379867672920227;0;0
103;14;-0.1431450843811035;0.8795484900474548;0.3353012502193451;0;0
103;15;-0.1438763588666916;0.8766187429428101;0.3331027626991272;0;0
103;16;-0.1429008096456528;0.8790599703788757;0.3321272134780884;0;0
103;17;-0.142656534910202;0.8779615163803101;0.3343241512775421;0;0
103;18;-0.1409481465816498;0.8801584243774414;0.3348127007484436;0;0
103;19;-0.1429008096456528;0.8816241025924683;0.3376195728778839;0;0
103;20;-0.1457076668739319;0.8821110725402832;0.3385966718196869;0;0
109;21;-0.1441206336021423;0.8832111358642578;0.3412821888923645;0;0
115;22;-0.1387496441602707;0.8832111358642578;0.3404279947280884;0;0
115;23;-0.1391168385744095;0.8822340369224548;0.3404279947280884;0;0
121;24;-0.1375298053026199;0.8843095898628235;0.3399394154548645;0;0
126;25;-0.1369199007749558;0.8868721723556519;0.337375283241272;0;0
133;26;-0.1375298053026199;0.8854080438613892;0.331394374370575;0;0
Something like this should get you going.
from pprint import pprint
def try_number(s):
    """Best-effort conversion of *s* to float (if it contains a dot) or
    int; returns *s* unchanged when it is not numeric."""
    try:
        return float(s) if "." in s else int(s, 10)
    except ValueError:
        return s


def read_umafall(fp):
    """Split a UMAFall CSV stream into its three parts.

    Returns a dict with:
      * "metadata"     — '% key: value' header fields as a dict
      * "header_lines" — other '%' header lines, kept verbatim
      * "data"         — semicolon-separated sample rows, cells numeric
                         where possible
    """
    header_lines = []
    metadata = {}
    data = []
    for raw in fp:
        stripped = raw.strip()
        if stripped.startswith("%"):
            # '%'-prefixed lines are header material: 'key: value'
            # pairs go into metadata, the rest are kept as-is.
            if ": " in stripped:
                key, _, value = stripped[1:].partition(": ")
                metadata[key.strip()] = value
            else:
                header_lines.append(stripped)
        elif ";" in stripped:
            # a semicolon-separated sample row: convert each cell
            data.append([try_number(cell) for cell in stripped.split(";")])
        elif stripped:
            # anything else is unexpected — flag it on stdout
            print("???", stripped)
    return {
        "header_lines": header_lines,
        "metadata": metadata,
        "data": data,
    }
# Parse one UMAFall recording and show what was extracted.
with open(
    "UMAFall_Subject_01_ADL_HandsUp_2_2017-04-14_23-33-21.csv",
    "r",
) as fp:
    result = read_umafall(fp)

# key/value header fields
pprint(result["metadata"])
# header lines that were not 'key: value' shaped
pprint(result["header_lines"])
# first ten numeric sample rows
pprint(result["data"][:10])
The output is e.g.
{'--> Maximum Range': '16 G',
'--> Min - Max Delay': '5000us - 65535000us',
'--> Resolution': '0.00024 G',
'--> Version': '1',
'Age': '67',
'Date': '2017-04-14_23:33:21',
'Description of the movement': 'HandsUp',
'Gender': 'F',
'Height(cm)': '156',
'ID': 'Subject_01_ADL_HandsUp_2',
'Name': 'Subject_01',
'Number of Sensors': '5',
"SensorTag's Accelerometer": 'MPU-9250 MEMS MotionTracking Device - '
'Invensense',
"Smartphone's Accelerometer": 'LGE Accelerometer - Vendor: BOSCH',
'Trial': '2',
'Type of Movement': 'FALSE',
'Used Smartphone': 'LGE-lge-LG-H815-5.1',
'Weight(Kg)': '76'}
['% Universidad de Malaga - ETSI de Telecomunicacion (Spain)',
'% MAC Address; Sensor_ID; Position; Device Model',
'%f8:95:c7:f3:ba:82; 0; RIGHTPOCKET; lge-LG-H815-5.1',
'%C4:BE:84:71:A5:02; 2; WAIST; SensorTag',
'%C4:BE:84:70:0E:80; 3; WRIST; SensorTag',
'%B0:B4:48:B8:77:03; 4; ANKLE; SensorTag',
'%C4:BE:84:70:64:8A; 1; CHEST; SensorTag',
'% Sensor_Type:',
'% Accelerometer = 0',
'% Gyroscope = 1',
'% Magnetometer = 2',
'% TimeStamp; Sample No; X-Axis; Y-Axis; Z-Axis; Sensor Type; Sensor ID;']
[[371, 1, -0.01265575457364321, 0.9133599400520325, -0.1938552260398865, 0, 0],
[371, 2, -0.01839394308626652, 0.9126286506652832, -0.1926354020833969, 0, 0],
[371, 3, -0.01802674867212772, 0.9129943251609802, -0.1948323398828507, 0, 0],
[371, 4, -0.02352065965533257, 0.9167782664299011, -0.1969063729047775, 0, 0],
[371, 5, -0.02315346524119377, 0.9209294319152832, -0.2019117176532745, 0, 0],
[371, 6, -0.01888094283640385, 0.9211721420288086, -0.203375831246376, 0, 0],
[371, 7, -0.0208351630717516, 0.9270316958427429, -0.2050857692956924, 0, 0],
[371, 8, -0.01924813725054264, 0.9303271174430847, -0.2070384472608566, 0, 0],
[371, 9, -0.01766111142933369, 0.9342340230941772, -0.2080155462026596, 0, 0],
[371, 10, -0.01265575457364321, 0.9388721585273743, -0.2115552425384522, 0, 0]]
That is, the parsed result contains:
* first the header lines that could be parsed as key-value pairs
* other header lines
* the data
You can hopefully trust each file to have the data in the same order (`TimeStamp; Sample No; X-Axis; Y-Axis; Z-Axis; Sensor Type; Sensor ID`).
I have a file where on each line I have text like this (representing cast of a film):
[{'cast_id': 23, 'character': "Roger 'Verbal' Kint", 'credit_id': '52fe4260c3a36847f8019af7', 'gender': 2, 'id': 1979, 'name': 'Kevin Spacey', 'order': 5, 'profile_path': '/x7wF050iuCASefLLG75s2uDPFUu.jpg'}, {'cast_id': 27, 'character': 'Edie's Finneran', 'credit_id': '52fe4260c3a36847f8019b07', 'gender': 1, 'id': 2179, 'name': 'Suzy Amis', 'order': 6, 'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]
I need to convert it in a valid json string, thus converting only the necessary single quotes to double quotes (e.g. the single quotes around word Verbal must not be converted, eventual apostrophes in the text also should not be converted).
I am using python 3.x. I need to find a regular expression which will convert only the right single quotes to double quotes, thus the whole text resulting in a valid json string. Any idea?
First of all, the line you gave as an example is not parsable! … 'Edie's Finneran' … contains a syntax error, no matter what.
Assuming that you have control over the input, you could simply use eval() to read in the file. (Although, in that case one would wonder why you can't produce valid JSON in the first place…)
>>> f = open('list.txt', 'r')
>>> s = f.read().strip()
>>> l = eval(s)
>>> import pprint
>>> pprint.pprint(l)
[{'cast_id': 23,
'character': "Roger 'Verbal' Kint",
...
'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]
>>> import json
>>> json.dumps(l)
'[{"cast_id": 23, "character": "Roger \'Verbal\' Kint", "credit_id": "52fe4260c3a36847f8019af7", "gender": 2, "id": 1979, "name": "Kevin Spacey", "order": 5, "profile_path": "/x7wF050iuCASefLLG75s2uDPFUu.jpg"}, {"cast_id": 27, "character": "Edie\'s Finneran", "credit_id": "52fe4260c3a36847f8019b07", "gender": 1, "id": 2179, "name": "Suzy Amis", "order": 6, "profile_path": "/b1pjkncyLuBtMUmqD1MztD2SG80.jpg"}]'
If you don't have control over the input, this is very dangerous, as it opens you up to code injection attacks.
I cannot emphasize enough that the best solution would be to produce valid JSON in the first place.
If you do not have control over the JSON data, do not eval() it!
I created a simple JSON correction mechanism, as that is more secure:
def correctSingleQuoteJSON(s):
    """Turn a single-quote-delimited JSON-like string into valid JSON.

    Unescaped single quotes become double quotes, backslash-escaped
    single quotes become plain apostrophes, and pre-existing double
    quotes get a backslash escape.
    """
    pieces = []
    prev_escape = False  # was the previous character an escape backslash?
    for ch in s:
        if ch == "'":
            if prev_escape:
                # \' -> plain apostrophe: drop the escape backslash
                pieces.pop()
                pieces.append("'")
            else:
                # string delimiter: switch to a double quote
                pieces.append('"')
            prev_escape = False
        elif ch == '"':
            # a literal double quote must now be escaped
            pieces.append('\\"')
            prev_escape = False
        else:
            pieces.append(ch)
            prev_escape = (ch == "\\")
    return "".join(pieces)
You can use the function in the following way:
# Example: run the corrector on a single-quoted JSON string whose inner
# apostrophes are escaped with backslashes, then parse the result.
import json
singleQuoteJson = "[{'cast_id': 23, 'character': 'Roger \\'Verbal\\' Kint', 'credit_id': '52fe4260c3a36847f8019af7', 'gender': 2, 'id': 1979, 'name': 'Kevin Spacey', 'order': 5, 'profile_path': '/x7wF050iuCASefLLG75s2uDPFUu.jpg'}, {'cast_id': 27, 'character': 'Edie\\'s Finneran', 'credit_id': '52fe4260c3a36847f8019b07', 'gender': 1, 'id': 2179, 'name': 'Suzy Amis', 'order': 6, 'profile_path': '/b1pjkncyLuBtMUmqD1MztD2SG80.jpg'}]"
correctJson = correctSingleQuoteJSON(singleQuoteJson)
# json.loads succeeds only if the corrected string is valid JSON
print(json.loads(correctJson))
Here is the code to get desired output
import ast
def getJson(filepath):
    """Read *filepath* line by line and literal-eval each line after
    escaping stray inner quotes.

    Strategy: split each line on ',' and then ':'; within each fragment,
    keep everything up to and including the first quote, then walk the
    remainder in reverse, leaving the last (closing) quote alone and
    backslash-escaping any earlier quote, so values containing quotes
    survive ast.literal_eval.
    """
    fr = open(filepath, 'r')  # NOTE(review): never closed; a with-block would be safer
    lines = []
    for line in fr.readlines():
        line_split = line.split(",")
        set_line_split = []
        for i in line_split:
            i_split = i.split(":")
            i_set_split = []
            for split_i in i_split:
                set_split_i = ""
                rev = ""
                i = 0  # NOTE(review): shadows the outer loop variable 'i'
                # copy characters up to and including the first quote
                for ch in split_i:
                    if ch in ['\"','\'']:
                        set_split_i += ch
                        i += 1
                        break
                    else:
                        set_split_i += ch
                        i += 1
                # reverse the remainder: keep the first quote met (the
                # fragment's closing quote) as-is, escape the rest
                i_rev = (split_i[i:])[::-1]
                state = False
                for ch in i_rev:
                    if ch in ['\"','\''] and state == False:
                        rev += ch
                        state = True
                    elif ch in ['\"','\''] and state == True:
                        # escape appears AFTER the quote here because the
                        # string is reversed again below
                        rev += ch+"\\"
                    else:
                        rev += ch
                i_rev = rev[::-1]
                set_split_i += i_rev
                i_set_split.append(set_split_i)
            set_line_split.append(":".join(i_set_split))
        line_modified = ",".join(set_line_split)
        # literal_eval is safe (no code execution), unlike eval()
        lines.append(ast.literal_eval(str(line_modified)))
    return lines
# Parse the file and print each reconstructed record, one per line.
lines = getJson('test.txt')
for i in lines:
    print(i)
Apart from eval() (mentioned in user3850's answer), you can use ast.literal_eval
This has been discussed in the thread: Using python's eval() vs. ast.literal_eval()?
You can also look at the following discussion threads from Kaggle competition which has data similar to the one mentioned by OP:
https://www.kaggle.com/c/tmdb-box-office-prediction/discussion/89313#latest-517927
https://www.kaggle.com/c/tmdb-box-office-prediction/discussion/80045#latest-518338
I am writing a script to report statistics from a text file in Markdown. The file contains book titles and dates. Each date belongs to the titles that follow, until a new date appears. Here is a sample:
#### 8/23/05
Defining the World (Hitchings)
#### 8/26/05
Lost Japan
#### 9/5/05
The Kite Runner
*The Dark Valley (Brendon)*
#### 9/9/05
Active Liberty
I iterate over lines in the file with a for loop and examine each line to see if it's a date. If it's a date, I set a variable this_date. If it's a title, I make it into a dict with the current value of this_date.
There are two exceptions: the file starts with titles, not a date, so I set an initial value for this_date before the for loop. And halfway through the file there is a region where dates were lost, and I set a specific date for those titles.
But in the resulting list of dicts, all the titles are given that date until the lost-data region starts. After that point, the rest of the titles are given the date that appears last in the file. What is most confusing: when I print the contents of this_date right before appending the new dict, it contains the correct value on every loop.
I expect this_date to be visible at all levels of the loop. I know I need to break this up into functions, and passing results explicitly between functions will probably fix the issue, but I'd like to know why this approach didn't work. Thank you very much.
# Parse a Markdown reading log: '####'-headed lines carry a date that
# applies to the title lines that follow.
result = []
# regex patterns
ddp = re.compile('\d+') # extract digits
mp = re.compile('^#+\s*\d+') # captures hashes and spaces
dp = re.compile('/\d+/') # captures slashes
yp = re.compile('\d+$') # trailing digits (the 2-digit year)
sp = re.compile('^\*') # leading asterisk marks a previously-read title
# initialize: titles before the first header get this fallback date
this_date = {
    'month': 4,
    'day': 30,
    'year': 2005
}
# print('this_date initialized')
for line in text:
    if line == '':
        pass
    else:
        if '#' in line: # markdown header format - line is a new date
            if 'Reconstructing lost data' in line: # handle exception
                # titles after this line are given 12/31/14 (the last date in the file) instead of 8/31/10
                # all prior dates are overwritten with 8/31/10
                # but the intent is that titles after this line appears have date 8/31/10, until the next date
                # (rebinding this_date to a NEW dict here is why entries
                # appended before this point keep 8/31/10 — see below)
                this_date = {
                    'month': 8,
                    'day': 31,
                    'year': 2010
                }
                # print('set this_date to handle exception')
            else: # get the date from the header
                month = ddp.search( mp.search(line).group() ) # digits only
                day = ddp.search( dp.search(line).group() ) # digits only
                year = yp.search(line)
                if month and day and year:
                    # print('setting this_date within header parse')
                    # NOTE(review): these assignments mutate the ONE
                    # this_date dict in place; every entry already
                    # appended to result holds a reference to that same
                    # dict, so they all change retroactively. Snapshot
                    # with this_date.copy() when building x to fix.
                    this_date['month'] = int(month.group())
                    this_date['day'] = int(day.group())
                    this_date['year'] = ( int(year.group()) + 2000 )
                else:
                    pass
        else: # line is a title
            # store a reference (not a copy!) to the shared date dict
            x = {
                'date': this_date,
                'read': False
            }
            if sp.match(line): # starts with asterisk - has been read
                x['read'] = True
                x['title'] = line[1:-3] # trim trailing asterisk and spaces
            else:
                x['title'] = line
            # this_date is correct when printed here
            # print('this_date is ' + str(this_date['month']) + '/' + str(this_date['day']) + '/' + str(this_date['year']) )
            result.append(x)
            # x has correct date when printed here
            # print(x)
# print("Done; found %d titles.") % len(result)
# elements of result have wrong dates (either 8/31/10 or 12/31/14, no other values) when printed here
# print( result[0::20])
You create the this_date dictionary just once. You then reuse that dictionary each loop iteration. You are only adding references to that dictionary to your result list; it is just the one dictionary referenced over and over again.
Store a new copy of the dictionary each loop iteration:
x = {
'date': this_date.copy(),
'read': False
}
Your code could do with some simplification; I'd use datetime.date() objects here instead as they model dates properly. No regular expressions are required:
from datetime import datetime

current_date = None
results = []
for raw_line in text:
    raw_line = raw_line.strip()
    # ignore blank lines entirely
    if not raw_line:
        continue
    # header line: remember its date for the titles that follow;
    # date objects are immutable, so later entries can safely share it
    if raw_line.startswith('#'):
        current_date = datetime.strptime(raw_line.strip('# '), '%m/%d/%y').date()
        continue
    # title line: pair it with the most recently seen header date
    entry = {'date': current_date, 'read': False}
    if raw_line.startswith('*') and raw_line.endswith('*'):
        # wrapped in asterisks -> previously read
        raw_line = raw_line.strip('*')
        entry['read'] = True
    entry['title'] = raw_line
    results.append(entry)
Because datetime.date() objects are immutable and we create a new date object each time we encounter a header line, you can safely re-use the last-read date.
Demo:
>>> from datetime import datetime
>>> from pprint import pprint
>>> text = '''\
... #### 8/23/05
... Defining the World (Hitchings)
... #### 8/26/05
... Lost Japan
... #### 9/5/05
... The Kite Runner
... *The Dark Valley (Brendon)*
... #### 9/9/05
... Active Liberty
... '''.splitlines(True)
>>> current_date = None
>>> results = []
>>> for line in text:
... line = line.strip()
... if not line:
... continue
... if line.startswith('#'):
... current_date = datetime.strptime(line.strip('# '), '%m/%d/%y').date()
... continue
... entry = {'date': current_date, 'read': False}
... if line.startswith('*') and line.endswith('*'):
... # previously read
... line = line.strip('*')
... entry['read'] = True
... entry['title'] = line
... results.append(entry)
...
>>> pprint(results)
[{'date': datetime.date(2005, 8, 23),
'read': False,
'title': 'Defining the World (Hitchings)'},
{'date': datetime.date(2005, 8, 26), 'read': False, 'title': 'Lost Japan'},
{'date': datetime.date(2005, 9, 5),
'read': False,
'title': 'The Kite Runner'},
{'date': datetime.date(2005, 9, 5),
'read': True,
'title': 'The Dark Valley (Brendon)'},
{'date': datetime.date(2005, 9, 9), 'read': False, 'title': 'Active Liberty'}]
I have a list to be exported to an Excel file keeping the appropriate format, I resorted to a library named xlsxwriter,
here is an example :
xlsxwriter
and here is my list :
{'FirstName': u'Forence','LastName': u'Bidorst', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
{'FirstName': u'Oliver','LastName': u'Bidorst', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
{'FirstName': u'Mathew','LastName': u'Stark', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
{'FirstName': u'Sphed','LastName': u'liomst', 'Salary': -6775000.0, 'BirthDate': datetime.datetime(2013, 6, 20, 0, 0)}
I modified the code to browse a list and insert it into the file,
def export_result_XL():
    """Write the entries from get_list() into the xlsxwriter worksheet.

    Question code: contains the two bugs described in the notes below.
    """
    list=get_list()  # NOTE(review): shadows the built-in 'list'
    ...
    # Write some data headers.
    worksheet.write('A1', 'First Name', bold)
    worksheet.write('B1', 'Last Name', bold)
    worksheet.write('C1', 'Salary', bold)
    worksheet.write('D1', 'Birth Date', bold)
    # Some data we want to write to the worksheet.
    for entry in list:
        x = str(entry['FirstName'])
        y = str(entry['LastName'])
        z = str(entry['Salary'])
        e = str(entry['BirthDate'])
        # NOTE(review): NameError — 'BirthDate' is undefined here; this
        # was presumably meant to be e[:10]
        v = BirthDate[:10] # because the date format is like yyyy-mm-dd 00:00:00
        expenses = (
            [x,y ,z ,v]
        )
        # Start from the first cell below the headers.
        # NOTE(review): row resets to 1 for every entry, so each entry
        # overwrites the previous one's cells
        row = 1
        col = 0
        # NOTE(review): 'expenses' is a flat list of 4 strings; unpacking
        # each string into 5 names is what raises the reported
        # "ValueError: too many values to unpack"
        for item ,str1,str2,str3,date_str in (expenses):
            # Convert the date string into a datetime object.
            date = datetime.strptime(date_str, "%Y-%m-%d")
            worksheet.write_string (row, col, str1 )
            worksheet.write_string (row, col + 1, str2 )
            worksheet.write_string(row, col + 2, str3 )
            worksheet.write_datetime(row, col + 3, date, date_format )
            row += 1
    # Write a total using a formula.
    #worksheet.write(row, 0, 'Total', bold)
    #worksheet.write(row, 2, '=SUM(C2:C5)', money_format)
    workbook.close()
    return ''
I had two problems here :
1 -
for item ,str1,str2,str3,date_str in (expenses)
ValueError: too many values to unpack
2-
if I avoid to convert to date format the file will be genreated but columns and rows are flipped
Does anyone have an idea how to do this? I hope I was clear in the description.
Finally I found a Solution :
# Working version: iterate the records once, advancing a running row
# index instead of resetting it per entry.
row = 1
col = 0
for entry in list:
    print entry
    # BirthDate arrives as 'yyyy-mm-dd 00:00:00'; keep only the date part
    strdate=str(entry['BirthDate'])
    formatdate=strdate[:10]
    date = datetime.strptime(str(formatdate), "%Y-%m-%d")
    worksheet.write_string (row, col, entry['FirstName'] )
    worksheet.write_string (row, col+1, entry['LastName'] )
    worksheet.write_number (row, col+6, entry['Salary'],number_format )
    worksheet.write_datetime(row, col+10, date, date_format )
    row += 1
workbook.close()