How to convert XML to a table in Python?

I'm taking an XML string from a URL. This part works fine.
import requests
import xml.etree.ElementTree as ET

REQUEST_URL = 'https://URL'
response = requests.get(REQUEST_URL, auth=(login, password))
xml_data = response.text.encode('utf-8', 'ignore')
root = ET.fromstring(xml_data)  # fromstring parses the XML string directly; ET.parse() expects a file path or file object
print(response.text) gives me:
<?xml version='1.0' standalone='yes'?><Report Type='SLA Report'
SiteName='Execute Query'
SLA_Name='Execute Query'
SLA_Description='Execute Query'
From='2018-11-27 00:00'
Thru='2018-11-27 23:59'
obj_device='4500'
locations='69,31,'
>
<Objective Type='Availability'>
<Goal>99.93</Goal>
<Actual>99.93</Actual>
<Compliant>Yes</Compliant>
<Errors>2</Errors>
<Checks>2878</Checks>
</Objective>
<Objective Type='Uptime'>
<Goal></Goal>
<Actual></Actual>
<Compliant></Compliant>
<Errors>0</Errors>
<Checks>0</Checks>
</Objective>
<Objective Type='Response Time'>
<Goal>300.00</Goal>
<Actual>3.1164</Actual>
<Compliant>Yes</Compliant>
<Errors>0</Errors>
<Checks>2878</Checks>
</Objective>
<MonitoringPeriods>
<Monitor>
<Exclude>No</Exclude><DayFrom>Sunday</DayFrom><TimeFrom>00:00</TimeFrom><DayThru>Sunday</DayThru><TimeThru>23:59</TimeThru>
</Monitor>
I'd like to get the data into a table so it's easier to work with (when I import it into Excel, it looks great). How can I do this with Python 3.x?
It may be something like this:
for sla in root.findall('Objective'):
    goal = sla.find('Goal').text
    actual = sla.find('Actual').text
    compliant = sla.find('Compliant').text
    errors = sla.find('Errors').text
    checks = sla.find('Checks').text
    print('Goal:', goal, 'Actual:', actual, 'Compliant:', compliant, 'Errors:', errors, 'Checks:', checks)
But I want to load each data point into a DataFrame rather than print it. How can I do that in Python? TIA.

Should print this:
# importing csv module
import csv

# csv file name
filename = "aapl.csv"

# initializing the titles and rows list
fields = []
rows = []

# reading csv file
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

    # get total number of rows
    print("Total no. of rows: %d" % (csvreader.line_num))

# printing the field names
print('Field names are: ' + ', '.join(field for field in fields))

# printing first 5 rows
print('\nFirst 5 rows are:\n')
for row in rows[:5]:
    # parsing each column of a row
    for col in row:
        print("%10s" % col, end=' ')
    print('\n')
Source: https://www.geeksforgeeks.org/working-csv-files-python/
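Coming back to the original XML question: a minimal sketch, assuming root is the parsed <Report> element from the code above (the objectives_to_dataframe helper name is just illustrative), of loading each <Objective> into a pandas DataFrame:

import pandas as pd

def objectives_to_dataframe(root):
    # Build one row per <Objective> element, keyed by its Type attribute
    records = []
    for sla in root.findall('Objective'):
        records.append({
            'Type': sla.get('Type'),
            'Goal': sla.findtext('Goal'),
            'Actual': sla.findtext('Actual'),
            'Compliant': sla.findtext('Compliant'),
            'Errors': sla.findtext('Errors'),
            'Checks': sla.findtext('Checks'),
        })
    return pd.DataFrame(records)

df = objectives_to_dataframe(root)
print(df)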

Related

How to iterate through all rows and save the data

I am working on a Python script that scrapes an HTML table and saves the data.
Right now everything is working except the part where it has to go through all the table rows and save each line as a JSON object. The problem is that it only saves the last line, not the previous ones. I know where the problem is but don't know how to fix it.
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import csv
import simplejson

url = 'http://website.com/group1.html'
htmlFile = urlopen(url)
soup = BeautifulSoup(htmlFile, 'html.parser')
table = soup.find_all("table", {"class": "sortable employeeList result-table"})[0]
rows = table.find_all('tr')

Filegroup1 = open('group1.csv', 'wt+')
Datagroup1 = csv.writer(Filegroup1)
jsonFilePath = r'group1.json'

def make_json(Filegroup1, jsonFilePath):
    data2 = {}
    with open('group1.csv', encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for rows in csvReader:
            data2 = rows
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(simplejson.dumps([data2], indent=4))

try:
    for row in rows:
        FilteredRow = []
        for cell in row.find_all(['td', 'th']):
            FilteredRow.append(cell.get_text().strip())
        Datagroup1.writerow(FilteredRow)
finally:
    Filegroup1.close()

make_json(Filegroup1, jsonFilePath)
The issue is here:
for rows in csvReader:
    data2 = rows
If I change it to the following, it works, but it groups each object by Employee ID, which I don't want:
for rows in csvReader:
    key = rows['Employee Name']
    data3[key] = rows
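A minimal sketch of one way to keep every row, assuming the same group1.csv produced above: collect the rows in a list instead of overwriting a single dict, then dump the whole list. (Note this version of make_json takes the CSV path rather than the file object.)

import csv
import simplejson

def make_json(csvFilePath, jsonFilePath):
    # Accumulate every row instead of overwriting one dict per iteration
    all_rows = []
    with open(csvFilePath, encoding='utf-8') as csvf:
        csvReader = csv.DictReader(csvf)
        for row in csvReader:
            all_rows.append(row)
    with open(jsonFilePath, 'w', encoding='utf-8') as jsonf:
        jsonf.write(simplejson.dumps(all_rows, indent=4))

make_json('group1.csv', 'group1.json')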

Python CSV output with Headers

I'm trying to save information about traffic from an API to a CSV file using Python.
I want the data to go underneath the tag names in the CSV output. The script loops every 10 minutes, as I want to build a graph / historical record of how bad a specific road stretch is.
It would look like this basically,
Traffic Delay - Length In Meters - Departure Time - etc
Data at 0time - Data at 0time - Data at 0time - etc
Data at10time - Data at10time - Data at10time - etc
In this example the headings are traffic travel time, length in meters, departure time and traffic delay in seconds.
The script I'm currently using is below. I've had a go at making a separate dictionary for the header but am struggling to collate it properly. Any Python gurus able to guide this noobie?
from lxml import etree
import urllib.request
import csv
import time

#append to list next
def handleSummary(summary):
    tagsOfInterest = ["noTrafficTravelTimeInSeconds", "lengthInMeters", "departureTime", "trafficDelayInSeconds"]  # whatever
    #list to use for data analysis
    global data
    global header
    header = tagsOfInterest
    data = []
    #create header dictionary that includes the data to be appended within it. IE, Header = {TrafficDelay[data(0)]...etc
    for child in summary:
        if 'summary' in child.tag:
            for elem in child:
                for item in tagsOfInterest:
                    if item in elem.tag:
                        data.append(elem.text)

#Parse the xml
def parseXML(xmlFile):
    while True:
        with urllib.request.urlopen("https://api.tomtom.com/routing/1/calculateRoute/-37.79205923474775,145.03010268799338:-37.798883995180496,145.03040309540322:-37.807106781970354,145.02895470253526:-37.80320743019992,145.01021142594075:-37.7999012967757,144.99318476311566:?routeType=shortest&key=notakey&computeTravelTimeFor=all") as fobj:
            xml = fobj.read()
            root = etree.fromstring(xml)
            for child in root:
                if 'route' in child.tag:
                    handleSummary(child)

        # Write CSV file
        with open('datafile.csv', 'a') as fp:
            writer = csv.writer(fp, delimiter=' ')
            # writer.writerow(["your", "header", "foo"]) # write header
            writer.writerow(data)
        with open('datafile.csv', 'r') as fp:
            reader = csv.reader(fp, quotechar='"')
            # next(reader, None) # skip the headers
            data_read = [row for row in reader]
            print(data_read)
        time.sleep(600)

if __name__ == "__main__":
    parseXML("xmlFile")
Thanks Stack!
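A minimal sketch of one way to get the header written exactly once, assuming the header and data lists that handleSummary builds (the append_row helper is just illustrative): write the header only when datafile.csv is missing or empty, then append the data row.

import csv
import os

def append_row(csv_path, header, data):
    # Write the header only if the file does not exist yet or is empty
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, 'a', newline='') as fp:
        writer = csv.writer(fp)
        if write_header:
            writer.writerow(header)
        writer.writerow(data)

# e.g. inside the polling loop, instead of the two open() blocks:
# append_row('datafile.csv', header, data)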

Txt file to Excel conversion in Python

I'm trying to convert a text file to an Excel sheet in Python. The txt file contains data in the format specified below.
Column names: reg no, zip code, loc id, emp id, lastname, first name. Each record has one or more error numbers. Each record has its column names listed above the values. I would like to create an Excel sheet containing reg no, firstname, lastname, and the errors listed in separate rows for each record.
How can I put the records into an Excel sheet? Should I be using regular expressions? And how can I insert the error numbers in different rows for the corresponding record?
Expected output:
Here is the link to the input file:
https://github.com/trEaSRE124/Text_Excel_python/blob/master/new.txt
Any code snippets or suggestions are kindly appreciated.
Here is a draft of the code. Let me know if any changes are needed:
# import pandas as pd
from collections import OrderedDict
from datetime import date
import csv

with open('in.txt') as f:
    with open('out.csv', 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL)
        # Remove initial clutter
        while("INPUT DATA" not in f.readline()):
            continue

        header = ["REG NO", "ZIP CODE", "LOC ID", "EMP ID", "LASTNAME", "FIRSTNAME", "ERROR"]
        data = list()
        errors = list()
        spamwriter.writerow(header)
        print header

        while(True):
            line = f.readline()
            errors = list()
            if("END" in line):
                exit()
            try:
                int(line.split()[0])
                data = line.strip().split()
                f.readline()  # get rid of \n
                line = f.readline()
                while("ERROR" in line):
                    errors.append(line.strip())
                    line = f.readline()
                spamwriter.writerow(data + errors)
                csvfile.flush()  # csv.writer has no flush(); flush the underlying file object instead
            except:
                continue

        # while(True):
        #     line = f.readline()
Run this with Python 2. The errors are appended as subsequent columns; doing it exactly the way you want is slightly more complicated. I can fix it if it's still needed.
Output looks like:
You can do this using the openpyxl library, which can write items directly into a spreadsheet. This code shows how to do that for your particular situation.
from openpyxl import Workbook

NEW_PERSON, ERROR_LINE = 1, 2

def Line_items():
    with open('katherine.txt') as katherine:
        for line in katherine:
            line = line.strip()
            if not line:
                continue
            items = line.split()
            if items[0].isnumeric():
                yield NEW_PERSON, items
            elif items[:2] == ['ERROR', 'NUM']:
                yield ERROR_LINE, line
            else:
                continue

wb = Workbook()
ws = wb.active

ws['A2'] = 'REG NO'
ws['B2'] = 'LASTNAME'
ws['C2'] = 'FIRSTNAME'
ws['D2'] = 'ERROR'

row = 2
for kind, data in Line_items():
    if kind == NEW_PERSON:
        row += 2
        ws['A{:d}'.format(row)] = int(data[0])
        ws['B{:d}'.format(row)] = data[-2]
        ws['C{:d}'.format(row)] = data[-1]
        first = True
    else:
        if first:
            first = False
        else:
            row += 1
        ws['D{:d}'.format(row)] = data

wb.save(filename='katherine.xlsx')
This is a screen snapshot of the result.
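As an alternative to writing cells one by one, a minimal sketch, assuming the records have already been parsed into dicts (the records_to_excel helper and the example values are illustrative; pandas needs openpyxl installed to write .xlsx):

import pandas as pd

def records_to_excel(records, out_path='output.xlsx'):
    # records: iterable of dicts like
    # {'REG NO': 123, 'LASTNAME': 'Doe', 'FIRSTNAME': 'Jane', 'ERROR': 'ERROR NUM 42'}
    df = pd.DataFrame(records, columns=['REG NO', 'LASTNAME', 'FIRSTNAME', 'ERROR'])
    df.to_excel(out_path, index=False)

# Example:
# records_to_excel([{'REG NO': 1, 'LASTNAME': 'Doe', 'FIRSTNAME': 'Jane', 'ERROR': 'ERROR NUM 42'}])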

Read CSV file and filter results

I'm writing a script where one of its functions is to read a CSV file that contains URLs in one of its columns. Unfortunately, the system that creates those CSVs doesn't put double quotes around values in the URL column, so when the URL contains commas it breaks all my CSV parsing.
This is the code I'm using:
with open(accesslog, 'r') as csvfile, open('results.csv', 'w') as enhancedcsv:
    reader = csv.DictReader(csvfile)
    for row in reader:
        self.uri = (row['URL'])
        self.OriCat = (row['Category'])
        self.query(self.uri)
        print self.URL + "," + self.ServerIP + "," + self.OriCat + "," + self.NewCat
This is a sample URL that breaks the parsing; it comes in the column named "URL" (note the commas at the end):
ams1-ib.adnxs.com/ww=1238&wh=705&ft=2&sv=43&tv=view5-1&ua=chrome&pl=mac&x=1468251839064740641,439999,v,mac,webkit_chrome,view5-1,0,,2,
The field right after the URL always comes with a numeric value between parentheses, e.g. (9999), so this could be used to determine where the URL with commas ends.
How can I deal with a situation like this using the csv module?
You will have to do it a little more manually. Try this
def process(lines, delimiter=','):
    header = None
    url_index_from_start = None
    url_index_from_end = None
    for line in lines:
        if not header:
            header = [l.strip() for l in line.split(delimiter)]
            url_index_from_start = header.index('URL')
            url_index_from_end = len(header) - url_index_from_start
        else:
            data = [l.strip() for l in line.split(delimiter)]
            url_from_start = url_index_from_start
            url_from_end = len(data) - url_index_from_end
            values = data[:url_from_start] + data[url_from_end+1:] + [delimiter.join(data[url_from_start:url_from_end+1])]
            keys = header[:url_index_from_start] + header[url_index_from_end+1:] + [header[url_index_from_start]]
            yield dict(zip(keys, values))
Usage:
lines = ['Header1, Header2, URL, Header3',
'Content1, "Content2", abc,abc,,abc, Content3']
result = list(process(lines))
assert result[0]['Header1'] == 'Content1'
assert result[0]['Header2'] == '"Content2"'
assert result[0]['Header3'] == 'Content3'
assert result[0]['URL'] == 'abc,abc,,abc'
print(result)
Result:
>>> [{'URL': 'abc,abc,,abc', 'Header2': '"Content2"', 'Header3': 'Content3', 'Header1': 'Content1'}]
Have you considered using Pandas to read your data in?
Another possible solution would be to use regular expressions to pre-process the data:
import re

# read the file once
f = open(filein, 'r')
filedata = f.read()
f.close()

# make a list of everything you want to change
# (regex is assumed to be a pattern that matches the unquoted URLs)
old = re.findall(regex, filedata)

# append quotes and create a new list
new = []
for url in old:
    url2 = "\"" + url + "\""
    new.append(url2)

# combine the lists
old_new = list(zip(old, new))

# then use the list to update the file contents and write it back once
for o, n in old_new:
    filedata = filedata.replace(o, n)

f = open(filein, 'w')
f.write(filedata)
f.close()
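Building on the hint in the question that the field right after the URL is always a number in parentheses, a minimal sketch (the rejoin_url helper and the column layout in the usage comment are assumptions) that splits each line naively and glues the URL pieces back together up to that marker:

import re

MARKER = re.compile(r'^\(\d+\)$')  # matches fields like "(9999)"

def rejoin_url(line, url_index, delimiter=','):
    # Split naively, then re-join the URL pieces until the "(9999)"-style field
    parts = [p.strip() for p in line.split(delimiter)]
    end = url_index
    while end < len(parts) and not MARKER.match(parts[end]):
        end += 1
    url = delimiter.join(parts[url_index:end])
    return parts[:url_index] + [url] + parts[end:]

# Example (assuming the URL is the second column):
# fields = rejoin_url('1.2.3.4,ams1-ib.adnxs.com/ww=1238&sv=43,view5-1,0,,2,,(9999),Ads', url_index=1)
# fields[1] is the full URL with its embedded commas, fields[2] is '(9999)'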

extracting data from CSV file using a reference

I have a CSV file with several hundred organism IDs and a second CSV file with several thousand organism IDs plus additional characteristics (taxonomic information, abundances per sample, etc).
I am trying to write code that will extract the information from the larger CSV using the smaller CSV file as a reference. Meaning it will look at both the smaller and larger files, and if an ID is in both files, it will extract all the information from the larger file and write it to a new file (basically write the entire row for that ID).
So far I have written the following, and while the code does not error out on me, I get a blank file in the end and I don't exactly know why. I am a graduate student who knows some simple coding, but I'm still very much a novice.
Thank you
import sys
import csv
import os.path

SparCCnames = open(sys.argv[1], "rU")
OTU_table = open(sys.argv[2], "rU")
new_file = open(sys.argv[3], "w")

Sparcc_OTUs = csv.writer(new_file)

d = csv.DictReader(SparCCnames)
ids = csv.DictReader(OTU_table)

for record in ids:
    idstopull = record["OTUid"]
    if idstopull[0] == "OTUid":
        continue
    if idstopull[0] in d:
        new_id.writerow[idstopull[0]]

SparCCnames.close()
OTU_table.close()
new_file.close()
I'm not sure what you're trying to do in your code but you can try this:
import csv

def csv_to_dict(csv_file_path):
    csv_file = open(csv_file_path, 'rb')
    csv_file.seek(0)
    sniffdialect = csv.Sniffer().sniff(csv_file.read(10000), delimiters='\t,;')
    csv_file.seek(0)
    dict_reader = csv.DictReader(csv_file, dialect=sniffdialect)
    csv_file.seek(0)
    dict_data = []
    for record in dict_reader:
        dict_data.append(record)
    csv_file.close()
    return dict_data

def dict_to_csv(csv_file_path, dict_data):
    csv_file = open(csv_file_path, 'wb')
    writer = csv.writer(csv_file, dialect='excel')
    headers = dict_data[0].keys()
    writer.writerow(headers)
    # headers must be the same with dat.keys()
    for dat in dict_data:
        line = []
        for field in headers:
            line.append(dat[field])
        writer.writerow(line)
    csv_file.close()

if __name__ == "__main__":
    big_csv = csv_to_dict('/path/to/big_csv_file.csv')
    small_csv = csv_to_dict('/path/to/small_csv_file.csv')
    output = []
    for s in small_csv:
        for b in big_csv:
            if s['id'] == b['id']:
                output.append(b)
    if output:
        dict_to_csv('/path/to/output.csv', output)
    else:
        print "Nothing."
Hope that will help.
You need to read the data into a data structure. Assuming OTUid is unique, you can store it in a dictionary for fast lookup:
with open(sys.argv[1], "rU") as SparCCnames:
    d = csv.DictReader(SparCCnames)
    fieldnames = d.fieldnames
    data = {i['OTUid']: i for i in d}

with open(sys.argv[2], "rU") as OTU_table, open(sys.argv[3], "w") as new_file:
    Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
    ids = csv.DictReader(OTU_table)
    for record in ids:
        if record['OTUid'] in data:
            Sparcc_OTUs.writerow(data[record['OTUid']])
Thank you everyone for your help. I played with things and consulted with an advisor, and finally got a working script. I am posting it in case it helps someone else in the future.
Thanks!
import sys
import csv

input_file = csv.DictReader(open(sys.argv[1], "rU"))  # has all info
ref_list = csv.DictReader(open(sys.argv[2], "rU"))    # reference list
output_file = csv.DictWriter(
    open(sys.argv[3], "w"), input_file.fieldnames)    # to write output file with headers
output_file.writeheader()                             # write headers in output file

white_list = {}                                       # create empty dictionary
for record in ref_list:                               # for every line in my reference list
    white_list[record["Sample_ID"]] = None            # store the IDs into the dictionary as keys

for record in input_file:                             # for every line in my input file
    record_id = record["Sample_ID"]                   # store the ID into variable record_id
    if (record_id in white_list):                     # if the ID is in the reference list
        output_file.writerow(record)                  # write the entire row into the new file
    else:                                             # if it is not in my reference list
        continue                                      # ignore it and continue iterating through the file
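For reference, the same filter can be written very compactly with pandas; a minimal sketch, assuming both files have a Sample_ID column (the file names are placeholders):

import pandas as pd

big = pd.read_csv('otu_table.csv')         # the large file with all the characteristics
small = pd.read_csv('reference_list.csv')  # the reference list of IDs

# keep only the rows of the big table whose Sample_ID appears in the reference list
filtered = big[big['Sample_ID'].isin(small['Sample_ID'])]
filtered.to_csv('filtered_output.csv', index=False)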
