Python Parsing XML to CSV with missing Elements - python

This is my first time trying to parse XML to CSV using Python. I need some help with the case where I have multiple customers and they do not all have the same child elements. When a customer is missing a child element, I want the CSV column to be populated with 'Empty'. 'Empty' is just a placeholder so that the values that do exist land in the correct columns.
Here's an example of what's happening; notice how the data in the second row that is supposed to be in the zipcode, street, and number fields gets squeezed into earlier columns where no values were found.
(screenshot: https://imgur.com/olZ9OEZ)
Here's an example of what I'm trying to do; as you can see, 'Empty' is just a placeholder:
(screenshot: https://imgur.com/w5389Kd)
Here's my python code:
import xml.etree.ElementTree as ET
import csv

tree = ET.parse(r'C:\Documents\cat.xml')
root = tree.getroot()

#Open the file for writing
CustomerData = open(r'C:\Users\Kris\Documents\customerdata.csv', 'w')

#Create header row object
header_row = []

#Create the csv writer object
csvwriter = csv.writer(CustomerData)

#Set count to 0
count = 0

#Find tags and text
for node in tree.iter('Customer'):
    data = []
    if count == 0:
        for customerid in node.iter('Id_Customer'):
            customer = customerid.tag
            header_row.append(customer)
        for segmentid in node.iter('Segment'):
            segment = segmentid.tag
            header_row.append(segment)
        for event in node.iter('Event'):
            for natureid in event.iter('Nature'):
                nature = natureid.tag
                header_row.append(nature)
        for event2 in node.iter('Event'):
            for Extrainfoid in event2.iter('Extrainfo'):
                extrainfo = Extrainfoid.tag
                header_row.append(extrainfo)
        for address in node.iter('Address'):
            for zipcode in address.iter('zipcode'):
                zipcd = zipcode.tag
                header_row.append(zipcd)
        for address in node.iter('Address'):
            for streetname in address.iter('street'):
                street = streetname.tag
                header_row.append(street)
        for address in node.iter('Address'):
            for number in address.iter('number'):
                num = number.tag
                header_row.append(num)
        csvwriter.writerow(header_row)
        count = count + 1
    for customerid in node.iter('Id_Customer'):
        customertxt = customerid.text
        data.append(customertxt)
    for segmentid in node.iter('Segment'):
        segmenttxt = segmentid.text
        data.append(segmenttxt)
    for event in node.iter('Event'):
        for natureid in event.iter('Nature'):
            naturetxt = natureid.text
            data.append(naturetxt)
    for event2 in node.iter('Event'):
        for Extrainfoid in event2.iter('Extrainfo'):
            extrainfotxt = Extrainfoid.text
            data.append(extrainfotxt)
    for address in node.iter('Address'):
        for zipcode in address.iter('zipcode'):
            zipcdtxt = zipcode.text
            data.append(zipcdtxt)
    for address in node.iter('Address'):
        for streetname in address.iter('street'):
            streettxt = streetname.text
            data.append(streettxt)
    for address in node.iter('Address'):
        for number in address.iter('number'):
            numtxt = number.text
            data.append(numtxt)
    csvwriter.writerow(data)
CustomerData.close()
Here is an example of XML code that is similar to mine, with different elements. It's not the real XML I'm using, just an example of how one customer can have elements that another customer does not. Please note that in my actual process the headers and everything display properly in my CSV file; I just need to write 'Empty' when an element does not actually have a value for that particular customer.
<CAT>
  <Header>...</Header>
  <Add>...</Add>
  <Customer>
    <Id_Customer>xyz1</Id_Customer>
    <Segment>abc1</Segment>
    <Event>
      <Nature>info1</Nature>
      <Extrainfo>info2</Extrainfo>
    </Event>
  </Customer>
  <Customer>
    <Id_Customer>zzwy</Id_Customer>
    <Segment>c2</Segment>
    <Adress>
      <zipcode>77098</zipcode>
      <street>belaire drive</street>
      <number>5</number>
    </Adress>
  </Customer>
  ...

You could create a list containing all the mappings you want. Search for each one, and if it is not present, catch the AttributeError and store an empty value for it:
import xml.etree.ElementTree as ET
import csv

fields = [
    ('Id_Customer', 'Id_Customer'),
    ('Segment', 'Segment'),
    ('Nature', 'Event/Nature'),
    ('Extrainfo', 'Event/Extrainfo'),
    ('zipcode', 'Adress/zipcode'),
    ('street', 'Adress/street'),
    ('number', 'Adress/number')]

tree = ET.parse('cat.xml')
root = tree.getroot()

with open(r'customerdata.csv', 'w', newline='') as f_customerdata:
    csv_customerdata = csv.DictWriter(f_customerdata, fieldnames=[field for field, match in fields])
    csv_customerdata.writeheader()

    for node in tree.iter('Customer'):
        row = {}
        for field_name, match in fields:
            try:
                row[field_name] = node.find(match).text
            except AttributeError:
                row[field_name] = ''
        csv_customerdata.writerow(row)
Giving you an output CSV file containing:
Id_Customer,Segment,Nature,Extrainfo,zipcode,street,number
xyz1,abc1,info1,info2,,,
zzwy,c2,,,77098,belaire drive,5
This approach also uses a DictWriter() instead of the standard csv writer. This makes it easier to assign values by name.
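If you specifically want the literal text 'Empty' instead of a blank cell, DictWriter's restval argument can supply it for every field that is missing from a row dict. A minimal sketch along the same lines (reusing the fields list and cat.xml from above; only elements that actually exist are written into the row, and restval fills in the rest):
import xml.etree.ElementTree as ET
import csv

fields = [
    ('Id_Customer', 'Id_Customer'),
    ('Segment', 'Segment'),
    ('Nature', 'Event/Nature'),
    ('Extrainfo', 'Event/Extrainfo'),
    ('zipcode', 'Adress/zipcode'),
    ('street', 'Adress/street'),
    ('number', 'Adress/number')]

tree = ET.parse('cat.xml')

with open('customerdata.csv', 'w', newline='') as f:
    # restval is written for any fieldname that is absent from the row dict
    writer = csv.DictWriter(f, fieldnames=[name for name, match in fields], restval='Empty')
    writer.writeheader()
    for node in tree.iter('Customer'):
        row = {}
        for name, match in fields:
            el = node.find(match)
            if el is not None:  # only add fields that exist for this customer
                row[name] = el.text
        writer.writerow(row)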
To cope with multiple address entries per customer, you first need to autocreate the maximum number of extra columns per entry. Then when accessing the elements, use findall() to get each one:
import xml.etree.ElementTree as ET
import csv

extra_columns = 2

fields = [
    ('Id_Customer', 'Id_Customer', 1),
    ('Segment', 'Segment', 1),
    ('Nature', 'Event/Nature', 1),
    ('Extrainfo', 'Event/Extrainfo', 1),
    ('zipcode', 'Adress/zipcode', extra_columns),
    ('street', 'Adress/street', extra_columns),
    ('number', 'Adress/number', extra_columns)]

tree = ET.parse('cat.xml')
root = tree.getroot()

# Auto create the header from fields
fieldnames = []
for field, match, cols in fields:
    fieldnames.append(field)
    if cols > 1:
        fieldnames.extend(["{}{}".format(field, x+2) for x in range(extra_columns)])

with open(r'customerdata.csv', 'w', newline='') as f_customerdata:
    csv_customerdata = csv.DictWriter(f_customerdata, fieldnames=fieldnames)
    csv_customerdata.writeheader()

    for node in tree.iter('Customer'):
        row = {}
        for field_name, match, cols in fields:
            if cols > 1:
                for index, el in enumerate(node.findall(match)):
                    try:
                        if index:
                            row["{}{}".format(field_name, index+1)] = el.text
                        else:
                            row[field_name] = el.text
                    except AttributeError:
                        row[field_name] = ''
            else:
                try:
                    row[field_name] = node.find(match).text
                except AttributeError:
                    row[field_name] = ''
        csv_customerdata.writerow(row)
So your header would now look like:
Id_Customer,Segment,Nature,Extrainfo,zipcode,zipcode2,zipcode3,street,street2,street3,number,number2,number3

Related

How to convert XML to table?

I'm taking an XML string from a URL. This part works fine.
REQUEST_URL = 'https://URL'
response = requests.get(REQUEST_URL, auth=(login, password))
xml_data = response.text.encode('utf-8', 'ignore')
tree = ET.parse(xml_data)
root = tree.getroot()
print(response.text) gives me:
<?xml version='1.0' standalone='yes'?><Report Type='SLA Report'
SiteName='Execute Query'
SLA_Name='Execute Query'
SLA_Description='Execute Query'
From='2018-11-27 00:00'
Thru='2018-11-27 23:59'
obj_device='4500'
locations='69,31,'
>
<Objective Type='Availability'>
<Goal>99.93</Goal>
<Actual>99.93</Actual>
<Compliant>Yes</Compliant>
<Errors>2</Errors>
<Checks>2878</Checks>
</Objective>
<Objective Type='Uptime'>
<Goal></Goal>
<Actual></Actual>
<Compliant></Compliant>
<Errors>0</Errors>
<Checks>0</Checks>
</Objective>
<Objective Type='Response Time'>
<Goal>300.00</Goal>
<Actual>3.1164</Actual>
<Compliant>Yes</Compliant>
<Errors>0</Errors>
<Checks>2878</Checks>
</Objective>
<MonitoringPeriods>
<Monitor>
<Exclude>No</Exclude><DayFrom>Sunday</DayFrom><TimeFrom>00:00</TimeFrom><DayThru>Sunday</DayThru><TimeThru>23:59</TimeThru>
</Monitor>
I'd like to get the data into a table so it's easier to work with. How can I do this with Python 3.x? When I import it into Excel, it looks great.
It may be something like this:
for sla in root.findall('Objective'):
    goal = sla.find('Goal').text
    actual = sla.find('Actual').text
    compliant = sla.find('Compliant').text
    errors = sla.find('Errors').text
    checks = sla.find('Checks').text
    print('Goal:', goal, 'Actual:', actual, 'Compliant:', compliant, 'Errors:', errors, 'Checks:', checks)
But I want to load each data point into a data frame, not print each data point. How can I do the same thing using Python? TIA.
Should print this:
# importing csv module
import csv

# csv file name
filename = "aapl.csv"

# initializing the titles and rows list
fields = []
rows = []

# reading csv file
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

    # get total number of rows
    print("Total no. of rows: %d"%(csvreader.line_num))

# printing the field names
print('Field names are:' + ', '.join(field for field in fields))

# printing first 5 rows
print('\nFirst 5 rows are:\n')
for row in rows[:5]:
    # parsing each column of a row
    for col in row:
        print("%10s"%col, end=' ')
    print('\n')
Source: https://www.geeksforgeeks.org/working-csv-files-python/
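For the DataFrame part of the question, here is a minimal sketch (a suggestion, not the poster's code) that turns each Objective element into one row; note that ET.fromstring() is the call that parses an XML string, whereas ET.parse() expects a file:
import xml.etree.ElementTree as ET
import pandas as pd

def objectives_to_dataframe(xml_text):
    # Parse the XML string directly (ET.parse expects a file path or file object).
    root = ET.fromstring(xml_text)
    rows = []
    for objective in root.findall('Objective'):
        # One row per <Objective>: its Type attribute plus every child element.
        row = {'Type': objective.get('Type')}
        for child in objective:
            row[child.tag] = child.text
        rows.append(row)
    return pd.DataFrame(rows)

# e.g. df = objectives_to_dataframe(response.text), assuming the full report XML is well formed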

Can't figure out how to properly output my data

I'm a relative novice at Python but somehow managed to build a scraper for Instagram. I now want to take this one step further and output the 5 most commonly used hashtags from an IG profile into my CSV output file.
Current output:
I've managed to isolate the 5 most commonly used hashtags, but I get this result in my csv:
[('#striveforgreatness', 3), ('#jamesgang', 3), ('#thekidfromakron', 2), ('#togetherwecanchangetheworld', 1), ('#halloweenchronicles', 1)]
Desired output:
What I'm looking to end up with is 5 columns at the end of my .CSV, each holding the X-th most commonly used value.
So something along the lines of this:
I've Googled for a while and managed to isolate them separately, but I always end up with '('#thekidfromakron', 2)' as an output. I seem to be missing some part of the puzzle :(.
Here is what I'm working with at the moment:
import csv
import requests
from bs4 import BeautifulSoup
import json
import re
import time
from collections import Counter

ts = time.gmtime()

def get_csv_header(top_numb):
    fieldnames = ['USER','MEDIA COUNT','FOLLOWERCOUNT','TOTAL LIKES','TOTAL COMMENTS','ER','ER IN %', 'BIO', 'ALL CAPTION TEXT','HASHTAGS COUNTED','MOST COMMON HASHTAGS']
    return fieldnames

def write_csv_header(filename, headers):
    with open(filename, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=headers)
        writer.writeheader()
    return

def read_user_name(t_file):
    with open(t_file) as f:
        user_list = f.read().splitlines()
    return user_list

if __name__ == '__main__':

    # HERE YOU CAN SPECIFY YOUR USERLIST FILE NAME,
    # Which contains a list of usernames's BY DEFAULT <current working directory>/userlist.txt
    USER_FILE = 'userlist.txt'

    # HERE YOU CAN SPECIFY YOUR DATA FILE NAME, BY DEFAULT (data.csv)', Where your final result stays
    DATA_FILE = 'users_with_er.csv'
    MAX_POST = 12  # MAX POST

    print('Starting the engagement calculations... Please wait until it finishes!')

    users = read_user_name(USER_FILE)

    """ Writing data to csv file """
    csv_headers = get_csv_header(MAX_POST)
    write_csv_header(DATA_FILE, csv_headers)

    for user in users:
        post_info = {'USER': user}
        url = 'https://www.instagram.com/' + user + '/'

        #for troubleshooting, un-comment the next two lines:
        #print(user)
        #print(url)

        try:
            r = requests.get(url)
            if r.status_code != 200:
                print(timestamp,' user {0} not found or page unavailable! Skipping...'.format(user))
                continue
            soup = BeautifulSoup(r.content, "html.parser")
            scripts = soup.find_all('script', type="text/javascript", text=re.compile('window._sharedData'))
            stringified_json = scripts[0].get_text().replace('window._sharedData = ', '')[:-1]
            j = json.loads(stringified_json)['entry_data']['ProfilePage'][0]
            timestamp = time.strftime("%d-%m-%Y %H:%M:%S", ts)
        except ValueError:
            print(timestamp,'ValueError for username {0}...Skipping...'.format(user))
            continue
        except IndexError as error:
            # Output expected IndexErrors.
            print(timestamp, error)
            continue

        if j['graphql']['user']['edge_followed_by']['count'] <=0:
            print(timestamp,'user {0} has no followers! Skipping...'.format(user))
            continue

        if j['graphql']['user']['edge_owner_to_timeline_media']['count'] <12:
            print(timestamp,'user {0} has less than 12 posts! Skipping...'.format(user))
            continue

        if j['graphql']['user']['is_private'] is True:
            print(timestamp,'user {0} has a private profile! Skipping...'.format(user))
            continue

        media_count = j['graphql']['user']['edge_owner_to_timeline_media']['count']
        accountname = j['graphql']['user']['username']
        followercount = j['graphql']['user']['edge_followed_by']['count']
        bio = j['graphql']['user']['biography']

        i = 0
        total_likes = 0
        total_comments = 0
        all_captiontext = ''
        while i <= 11:
            total_likes += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_liked_by']['count']
            total_comments += j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_comment']['count']
            captions = j['graphql']['user']['edge_owner_to_timeline_media']['edges'][i]['node']['edge_media_to_caption']
            caption_detail = captions['edges'][0]['node']['text']
            all_captiontext += caption_detail
            i += 1

        engagement_rate_percentage = '{0:.4f}'.format((((total_likes + total_comments) / followercount)/12)*100) + '%'
        engagement_rate = (((total_likes + total_comments) / followercount)/12*100)

        #isolate and count hashtags
        hashtags = re.findall(r'#\w*', all_captiontext)
        hashtags_counted = Counter(hashtags)
        most_common = hashtags_counted.most_common(5)

        with open('users_with_er.csv', 'a', newline='', encoding='utf-8') as data_out:

            print(timestamp,'Writing Data for user {0}...'.format(user))
            post_info["USER"] = accountname
            post_info["FOLLOWERCOUNT"] = followercount
            post_info["MEDIA COUNT"] = media_count
            post_info["TOTAL LIKES"] = total_likes
            post_info["TOTAL COMMENTS"] = total_comments
            post_info["ER"] = engagement_rate
            post_info["ER IN %"] = engagement_rate_percentage
            post_info["BIO"] = bio
            post_info["ALL CAPTION TEXT"] = all_captiontext
            post_info["HASHTAGS COUNTED"] = hashtags_counted

            csv_writer = csv.DictWriter(data_out, fieldnames=csv_headers)
            csv_writer.writerow(post_info)

    """ Done with the script """
    print('ALL DONE !!!! ')
The code that goes before this simply scrapes the webpage, and compiles all the captions from the last 12 posts into "all_captiontext".
Any help to solve this (probably simple) issue would be greatly appreciated as I've been struggling with this for days (again, I'm a noob :') ).
Replace the line
post_info["MOST COMMON HASHTAGS"] = most_common
with:
for i, counter_tuple in enumerate(most_common):
    tag_name = counter_tuple[0].replace('#','')
    label = "Top %d" % (i + 1)
    post_info[label] = tag_name
There's also a bit of code missing. For example, your code doesn't include the csv_headers variable, which I suppose would be
csv_headers = post_info.keys()
It also seems that you're opening a file to write just one row. I don't think that's intended, so what you would like to do is to collect the results into a list of dictionaries. A cleaner solution would be to use pandas' dataframe, which you can output straight into a csv file.
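As a sketch of that last suggestion (the all_rows list and the write_results helper are hypothetical names, not part of the original script): collect one post_info dict per user inside the loop, then write the whole CSV in one go with pandas:
import pandas as pd

def write_results(rows, path='users_with_er.csv'):
    # rows: a list of post_info-style dicts, appended inside the per-user loop
    # instead of calling csv_writer.writerow(post_info) for each user.
    df = pd.DataFrame(rows)
    df.to_csv(path, index=False, encoding='utf-8')

# e.g. all_rows.append(post_info) inside the loop, then write_results(all_rows) at the end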
most_common being the output of the call to hashtags_counted.most_common, I had a look at the docs here: https://docs.python.org/2/library/collections.html#collections.Counter.most_common
The output is formatted as follows: [(key, value), (key, value), ...], ordered by decreasing number of occurrences.
Hence, to get only the name and not the number of occurrences, you should replace:
post_info["MOST COMMON HASHTAGS"] = most_common
with:
post_info["MOST COMMON HASHTAGS"] = [x[0] for x in most_common]
You have a list of tuples. This statement builds, on the fly, the list of the first element of each tuple, keeping the sort order.
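For example (a quick illustration in the interpreter, not taken from the question's data):
from collections import Counter

most_common = Counter(['#a', '#a', '#b']).most_common(2)
# most_common == [('#a', 2), ('#b', 1)]
[x[0] for x in most_common]
# ['#a', '#b']  -- hashtag names only, order of frequency preserved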

Shaking off duplicates while parsing

I've made a parser written in Python which is doing its job perfectly, except for some duplicates coming along. Moreover, when I open the CSV file I can see that every result is surrounded by square brackets. Is there any workaround to get rid of the duplicate data and square brackets on the fly? Here is what I tried:
import csv
import requests
from lxml import html

def parsingdata(mpg):
    data = set()
    outfile = open('RealYP.csv', 'w', newline='')
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Address", "Phone"])
    pg = 1
    while pg <= mpg:
        url = "https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page=" + str(pg)
        page = requests.get(url)
        tree = html.fromstring(page.text)
        titles = tree.xpath('//div[@class="info"]')
        items = []
        for title in titles:
            comb = []
            Name = title.xpath('.//span[@itemprop="name"]/text()')
            Address = title.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()')
            Phone = title.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()')
            try:
                comb.append(Name[0])
                comb.append(Address[0])
                comb.append(Phone[0])
            except:
                continue
            items.append(comb)
        pg += 1
        for item in items:
            writer.writerow(item)

parsingdata(3)
Now it is working fine.
Edit: The rectified portion is taken from bjpreisler's answer.
This script removes dups when I am working with a .csv file. Check if this works for you :)
with open(file_out, 'w') as f_out, open(file_in, 'r') as f_in:
    # write rows from in-file to out-file until all the data is written
    checkDups = set()  # set for removing duplicates
    for line in f_in:
        if line in checkDups: continue  # skip duplicate
        checkDups.add(line)
        f_out.write(line)
You are currently writing a list (items) to the csv which is why it is in brackets. To avoid this, use another for loop that could look like this:
for title in titles:
    comb = []
    Name = title.xpath('.//span[@itemprop="name"]/text()')
    Address = title.xpath('.//span[@itemprop="streetAddress" and @class="street-address"]/text()')
    Phone = title.xpath('.//div[@itemprop="telephone" and @class="phones phone primary"]/text()')
    if Name:
        Name = Name[0]
    if Address:
        Address = Address[0]
    if Phone:
        Phone = Phone[0]
    comb.append(Name)
    comb.append(Address)
    comb.append(Phone)
    print(comb)
    items.append(comb)
pg += 1
for item in items:
    writer.writerow(item)

parsingdata(3)
This should write each item separately to your csv. It turns out the items you were appending to comb were lists themselves, so this extracts them.
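To also drop duplicate rows on the fly, one option (a sketch that reuses the items list and writer from the code above) is to remember which rows have already been written:
seen = set()
for item in items:
    record = tuple(item)      # lists are not hashable, tuples are
    if record in seen:
        continue              # skip duplicate rows
    seen.add(record)
    writer.writerow(item)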
And the concise version of this scraper I found lately is:
import csv
import requests
from lxml import html

url = "https://www.yellowpages.com/search?search_terms=Coffee%20Shops&geo_location_terms=Los%20Angeles%2C%20CA&page={0}"

def parsingdata(link):
    outfile = open('YellowPage.csv', 'w', newline='')
    writer = csv.writer(outfile)
    writer.writerow(["Name", "Address", "Phone"])
    for page_link in [link.format(i) for i in range(1, 4)]:
        page = requests.get(page_link).text
        tree = html.fromstring(page)
        for title in tree.xpath('//div[@class="info"]'):
            Name = title.findtext('.//span[@itemprop="name"]')
            Address = title.findtext('.//span[@itemprop="streetAddress"]')
            Phone = title.findtext('.//div[@itemprop="telephone"]')
            print([Name, Address, Phone])
            writer.writerow([Name, Address, Phone])

parsingdata(url)

How to bypass IndexError

Here is my situation: My code parses out data from HTML tables that are within emails. The roadblock I'm running into is that some of these tables have blank empty rows right in the middle of the table, as seen in the photo below. This blank space causes my code to fail (IndexError: list index out of range) since it attempts to extract text from the cells.
Is it possible to say to Python: "ok, if you run into this error that comes from these blank rows, just stop there and take the rows you have acquired text from so far and execute the rest of the code on those"...?
That might sound like a dumb solution to this problem but my project involves me taking data from only the most recent date in the table anyway, which is always amongst the first few rows, and always before these blank empty rows.
So if it is possible to say "if you hit this error, just ignore it and proceed" then I would like to learn how to do that. If it's not then I'll have to figure out another way around this. Thanks for any and all help.
The table with the gap:
My code:
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import numpy as np
import os
import re
import email
import cx_Oracle

dsnStr = cx_Oracle.makedsn("sole.nefsc.noaa.gov", "1526", "sole")
con = cx_Oracle.connect(user="user", password="password", dsn=dsnStr)

def celltext(cell):
    '''
    textlist=[]
    for br in cell.findAll('br'):
        next = br.nextSibling
        if not (next and isinstance(next,NavigableString)):
            continue
        next2 = next.nextSibling
        if next2 and isinstance(next2,Tag) and next2.name == 'br':
            text = str(next).strip()
            if text:
                textlist.append(next)
    return (textlist)
    '''
    textlist = []
    y = cell.find('span')
    for a in y.childGenerator():
        if isinstance(a, NavigableString):
            textlist.append(str(a))
    return (textlist)

path = 'Z:\\blub_2'

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        html = open(file_path, 'r').read()
        soup = BeautifulSoup(html, 'lxml')  # Parse the HTML as a string
        table = soup.find_all('table')[1]   # Grab the second table

        df_Quota = pd.DataFrame()

        for row in table.find_all('tr'):
            columns = row.find_all('td')
            if columns[0].get_text().strip() != 'ID':  # skip header
                Quota = celltext(columns[1])
                Weight = celltext(columns[2])
                price = celltext(columns[3])
                print(Quota)

                Nrows = max([len(Quota), len(Weight), len(price)])  # get the max number of rows

                IDList = [columns[0].get_text()] * Nrows
                DateList = [columns[4].get_text()] * Nrows

                if price[0].strip() == 'Package':
                    price = [columns[3].get_text()] * Nrows

                if len(Quota) < len(Weight):  # if Quota has less items extend with NaN
                    lstnans = [np.nan] * (len(Weight) - len(Quota))
                    Quota.extend(lstnans)

                if len(price) < len(Quota):  # if price column has less items than quota column,
                    val = [columns[3].get_text()] * (len(Quota) - len(price))  # extend with
                    price.extend(val)                                          # whatever is in
                                                                               # price column

                #if len(DateList) > len(Quota):  # if DateList is longer than Quota,
                    #print("it's longer than")
                    #value = [columns[4].get_text()] * (len(DateList) - len(Quota))
                    #DateList = value * Nrows

                if len(Quota) < len(DateList):  # if Quota is less than DateList (due to gap),
                    stu = [np.nan] * (len(DateList) - len(Quota))  # extend with NaN
                    Quota.extend(stu)

                if len(Weight) < len(DateList):
                    dru = [np.nan] * (len(DateList) - len(Weight))
                    Weight.extend(dru)

                FinalDataframe = pd.DataFrame(
                    {
                        'ID': IDList,
                        'AvailableQuota': Quota,
                        'LiveWeightPounds': Weight,
                        'price': price,
                        'DatePosted': DateList
                    })

                df_Quota = df_Quota.append(FinalDataframe, ignore_index=True)

        #df_Quota = df_Quota.loc[df_Quota['DatePosted']=='5/20']
        df_Q = df_Quota['DatePosted'].iloc[0]
        df_Quota = df_Quota[df_Quota['DatePosted'] == df_Q]
        print(df_Quota)

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            pattern = re.compile(r'Sent:.*?\b(\d{4})\b')
            email = f.read()
            dates = pattern.findall(email)
            if dates:
                print("Date:", ''.join(dates))

#cursor = con.cursor()
#exported_data = [tuple(x) for x in df_Quota.values]
#sql_query = ("INSERT INTO ROUGHTABLE(species, date_posted, stock_id, pounds, money, sector_name, ask)" "VALUES (:1, :2, :3, :4, :5, 'NEFS 2', '1')")
#cursor.executemany(sql_query, exported_data)
#con.commit()
#cursor.close()
#con.close()
continue is the keyword to use for skipping empty/problem rows. IndexError is thanks to the attempt to access columns[0] on an empty columns list. So just skip to next row when there is an exception.
for row in table.find_all('tr'):
    columns = row.find_all('td')
    try:
        if columns[0].get_text().strip() != 'ID':
            ...  # Rest as above in original code.
    except IndexError:
        continue
Use try: ... except: ...:
try:
    # extract data from table
    ...
except IndexError:
    # execute rest of program
    ...
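Since the stated goal is to keep the rows gathered before the blank row and move on, breaking out of the loop is another option. A rough sketch built on the same table object (rows_collected is a hypothetical name):
rows_collected = []
for row in table.find_all('tr'):
    columns = row.find_all('td')
    try:
        if columns[0].get_text().strip() != 'ID':   # skip the header row
            rows_collected.append([c.get_text() for c in columns])
    except IndexError:
        break   # blank row reached; keep what was gathered so far
# ... continue the rest of the processing with rows_collected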

Python Pandas - Read csv file containing multiple tables

I have a single .csv file containing multiple tables.
Using pandas, what would be the best strategy to get two DataFrames, inventory and HPBladeSystemRack, from this one file?
The input .csv looks like this:
Inventory
System Name           IP Address     System Status
dg-enc05                             Normal
dg-enc05_vc_domain                   Unknown
dg-enc05-oa1          172.20.0.213   Normal

HP BladeSystem Rack
System Name           Rack Name      Enclosure Name
dg-enc05              BU40
dg-enc05-oa1          BU40           dg-enc05
dg-enc05-oa2          BU40           dg-enc05
The best I've come up with so far is to convert this .csv file into an Excel workbook (xlsx), split the tables into sheets, and use:
inventory = read_excel('path_to_file.xlsx', 'sheet1', skiprows=1)
HPBladeSystemRack = read_excel('path_to_file.xlsx', 'sheet2', skiprows=2)
However:
This approach requires the xlrd module.
Those log files have to be analyzed in real time, so it would be much better to find a way to analyze them as they come in from the logs.
The real logs have far more tables than those two.
If you know the table names beforehand, then something like this:
df = pd.read_csv("jahmyst2.csv", header=None, names=range(3))
table_names = ["Inventory", "HP BladeSystem Rack", "Network Interface"]
groups = df[0].isin(table_names).cumsum()
tables = {g.iloc[0,0]: g.iloc[1:] for k,g in df.groupby(groups)}
should work to produce a dictionary with keys as the table names and values as the subtables.
>>> list(tables)
['HP BladeSystem Rack', 'Inventory']
>>> for k,v in tables.items():
... print("table:", k)
... print(v)
... print()
...
table: HP BladeSystem Rack
              0          1               2
6   System Name  Rack Name  Enclosure Name
7      dg-enc05       BU40             NaN
8  dg-enc05-oa1       BU40        dg-enc05
9  dg-enc05-oa2       BU40        dg-enc05

table: Inventory
                    0             1              2
1         System Name    IP Address  System Status
2            dg-enc05           NaN         Normal
3  dg-enc05_vc_domain           NaN        Unknown
4        dg-enc05-oa1  172.20.0.213         Normal
Once you've got that, you can set the column names to the first rows, etc.
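For instance, promoting the first row of each sub-table to its column header could look like this (a sketch building on the tables dict above):
cleaned = {}
for name, t in tables.items():
    t = t.copy()
    t.columns = t.iloc[0]                       # first row holds the real header
    cleaned[name] = t.iloc[1:].reset_index(drop=True)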
I assume you know the names of the tables you want to parse out of the csv file. If so, you could retrieve the index positions of each, and select the relevant slices accordingly. As a sketch, this could look like:
df = pd.read_csv('path_to_file')

index_positions = []
for table in table_names:
    index_positions.append(df[df['col_with_table_names'] == table].index.tolist()[0])

## Include end of table for last slice, omit for iteration below
index_positions.append(df.index.tolist()[-1])

tables = {}
for position in index_positions[:-1]:
    table_no = index_positions.index(position)
    tables[table_names[table_no]] = df.loc[position:index_positions[table_no + 1]]
There are certainly more elegant solutions but this should give you a dictionary with the table names as keys and the corresponding tables as values.
Pandas doesn't seem to be ready to do this easily, so I ended up doing my own split_csv function. It only requires table names and will output .csv files named after each table.
import csv
from os.path import dirname  # gets parent folder in a path
from os.path import join     # concatenate paths

table_names = ["Inventory", "HP BladeSystem Rack", "Network Interface"]

def split_csv(csv_path, table_names):
    tables_infos = detect_tables_from_csv(csv_path, table_names)
    for table_info in tables_infos:
        split_csv_by_indexes(csv_path, table_info)

def split_csv_by_indexes(csv_path, table_info):
    title, start_index, end_index = table_info
    print title, start_index, end_index
    dir_ = dirname(csv_path)
    output_path = join(dir_, title) + ".csv"
    with open(output_path, 'w') as output_file, open(csv_path, 'rb') as input_file:
        writer = csv.writer(output_file)
        reader = csv.reader(input_file)
        for i, line in enumerate(reader):
            if i < start_index:
                continue
            if i > end_index:
                break
            writer.writerow(line)

def detect_tables_from_csv(csv_path, table_names):
    output = []
    with open(csv_path, 'rb') as csv_file:
        reader = csv.reader(csv_file)
        for idx, row in enumerate(reader):
            for col in row:
                match = [title for title in table_names if title in col]
                if match:
                    match = match[0]  # get the first matching element
                    try:
                        end_index = idx - 1
                        start_index
                    except NameError:
                        start_index = 0
                    else:
                        output.append((previous_match, start_index, end_index))

                    print "Found new table", col
                    start_index = idx
                    previous_match = match
                    match = False

        end_index = idx  # last 'end_index' set to EOF
        output.append((previous_match, start_index, end_index))
    return output

if __name__ == '__main__':
    csv_path = 'switch_records.csv'
    try:
        split_csv(csv_path, table_names)
    except IOError as e:
        print "This file doesn't exist. Aborting."
        print e
        exit(1)
