I'm trying to save information about traffic from an API to a CSV file using Python.
I want the data to go underneath the tag names in the CSV output. The script loops every 10 minutes, because I want a graph / historic data of how bad a specific stretch of road is.
It would basically look like this:
Traffic Delay - Length In Meters - Departure Time - etc.
Data at 0 min - Data at 0 min - Data at 0 min - etc.
Data at 10 min - Data at 10 min - Data at 10 min - etc.
In this example the headings are traffic travel time, length in meters, departure time and traffic delay in seconds.
The script I'm currently using is below. I've had a go at making a separate dictionary for the header, but I'm struggling to collate it properly. Any Python gurus able to guide this newbie?
from lxml import etree
import urllib.request
import csv
import time
#append to list next
def handleSummary(summary):
    tagsOfInterest=["noTrafficTravelTimeInSeconds", "lengthInMeters", "departureTime", "trafficDelayInSeconds"] # whatever
    #list to use for data analysis
    global data
    global header
    header = tagsOfInterest
    data = []
    #create header dictionary that includes the data to be appended within it. IE, Header = {TrafficDelay[data(0)]...etc
    for child in summary:
        if 'summary' in child.tag:
            for elem in child:
                for item in tagsOfInterest:
                    if item in elem.tag:
                        data.append(elem.text)
#Parse the xml
def parseXML(xmlFile):
    while True:
        with urllib.request.urlopen("https://api.tomtom.com/routing/1/calculateRoute/-37.79205923474775,145.03010268799338:-37.798883995180496,145.03040309540322:-37.807106781970354,145.02895470253526:-37.80320743019992,145.01021142594075:-37.7999012967757,144.99318476311566:?routeType=shortest&key=notakey&computeTravelTimeFor=all") as fobj:
            xml = fobj.read()
        root = etree.fromstring(xml)
        for child in root:
            if 'route' in child.tag:
                handleSummary(child)

        # Write CSV file
        with open('datafile.csv', 'a') as fp:
            writer = csv.writer(fp, delimiter=' ')
            # writer.writerow(["your", "header", "foo"]) # write header
            writer.writerow(data)

        with open('datafile.csv', 'r') as fp:
            reader = csv.reader(fp, quotechar='"')
            # next(reader, None) # skip the headers
            data_read = [row for row in reader]
        print(data_read)
        time.sleep(600)
if __name__ == "__main__":
    parseXML("xmlFile")
Thanks Stack!
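For reference, one way the header could be collated is sketched below (rough and untested, assuming the same tagsOfInterest as above and a comma-delimited file rather than the space delimiter used in the script): write the header row only when datafile.csv does not exist yet, then append one data row per 10-minute poll.

import csv
import os

tagsOfInterest = ["noTrafficTravelTimeInSeconds", "lengthInMeters",
                  "departureTime", "trafficDelayInSeconds"]

def append_row(data, path='datafile.csv'):
    # Write the header once (only when the file is new), then append one row per poll.
    write_header = not os.path.exists(path)
    with open(path, 'a', newline='') as fp:
        writer = csv.writer(fp)
        if write_header:
            writer.writerow(tagsOfInterest)
        writer.writerow(data)

Calling append_row(data) in place of the existing writer.writerow(data) block would keep the tag names as the first row and the polled values underneath them.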
Related
I'm trying to download files from a site, and due to search result limitations (max 300), I need to search each item individually. I have a CSV file with the complete list, and I've written some basic code to return the ID# column.
With some help, I've got another script that iterates through each search result and downloads a file. What I need to do now is combine the two so that it will search each individual ID# and download the file.
I know my loop is messed up here; I just can't figure out where, and whether I'm even looping in the right order.
import requests, json, csv

faciltiyList = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        faciltiyList.append(searchterm[0])

url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
r = requests.get(url+"?term="+str(searchterm))
searchresults = json.loads(r.content.decode('utf-8'))
for report in searchresults:
    rpt_id = report['RPT_ID']
    reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r = requests.get(reporturl)
    a = r.headers['Content-Disposition']
    filename = a[a.find("filename=")+9:len(a)]
    file = open(filename, "wb")
    file.write(r.content)
    r.close()
The original code I have is here:
import requests, json

searchterm="ALAMEDA (COUNTY)"
url="https://siera.oshpd.ca.gov/FindFacility.aspx"
r=requests.get(url+"?term="+searchterm)
searchresults=json.loads(r.content.decode('utf-8'))
for report in searchresults:
    rpt_id=report['RPT_ID']
    reporturl=f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
    r=requests.get(reporturl)
    a=r.headers['Content-Disposition']
    filename=a[a.find("filename=")+9:len(a)]
    file = open(filename, "wb")
    file.write(r.content)
    r.close()
The searchterm = "ALAMEDA (COUNTY)" returns more than 300 results, so I'm trying to replace "ALAMEDA (COUNTY)" with a list that runs through each name (ID# in this case) so that I get just one result, then run again for the next one on the list.
CSV - just 1 line
Tested with a CSV file with just 1 line:
406014324,"HOLISTIC PALLIATIVE CARE, INC.",550004188,Parent Facility,5707 REDWOOD RD,OAKLAND,94619,1,ALAMEDA,Not Applicable,,Open,1/1/2018,Home Health Agency/Hospice,Hospice,37.79996,-122.17075
Python code
This script reads the IDs from the CSV file. Then it fetches the results from the URL and finally writes the desired contents to disk.
import requests, json, csv

# read Ids from csv
facilityIds = []
with open('Facility List.csv', 'r') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for searchterm in csv_reader:
        facilityIds.append(searchterm[0])

# fetch and write file contents
url = "https://siera.oshpd.ca.gov/FindFacility.aspx"
for facilityId in facilityIds:
    r = requests.get(url+"?term="+str(facilityId))
    reports = json.loads(r.content.decode('utf-8'))
    # print(f"reports = {reports}")
    for report in reports:
        rpt_id = report['RPT_ID']
        reporturl = f"https://siera.oshpd.ca.gov/DownloadPublicFile.aspx?archrptsegid={rpt_id}&reporttype=58&exportformatid=8&versionid=1&pageid=1"
        r = requests.get(reporturl)
        a = r.headers['Content-Disposition']
        filename = a[a.find("filename=")+9:len(a)]
        # print(f"filename = {filename}")
        with open(filename, "wb") as o:
            o.write(r.content)
I'm taking an XML string from a URL. This part works fine.
import requests
import xml.etree.ElementTree as ET

REQUEST_URL = 'https://URL'
response = requests.get(REQUEST_URL, auth=(login, password))
xml_data = response.text.encode('utf-8', 'ignore')
root = ET.fromstring(xml_data)  # parse the in-memory XML string; ET.parse() expects a file
print(response.text) gives me:
<?xml version='1.0' standalone='yes'?><Report Type='SLA Report'
SiteName='Execute Query'
SLA_Name='Execute Query'
SLA_Description='Execute Query'
From='2018-11-27 00:00'
Thru='2018-11-27 23:59'
obj_device='4500'
locations='69,31,'
>
<Objective Type='Availability'>
<Goal>99.93</Goal>
<Actual>99.93</Actual>
<Compliant>Yes</Compliant>
<Errors>2</Errors>
<Checks>2878</Checks>
</Objective>
<Objective Type='Uptime'>
<Goal></Goal>
<Actual></Actual>
<Compliant></Compliant>
<Errors>0</Errors>
<Checks>0</Checks>
</Objective>
<Objective Type='Response Time'>
<Goal>300.00</Goal>
<Actual>3.1164</Actual>
<Compliant>Yes</Compliant>
<Errors>0</Errors>
<Checks>2878</Checks>
</Objective>
<MonitoringPeriods>
<Monitor>
<Exclude>No</Exclude><DayFrom>Sunday</DayFrom><TimeFrom>00:00</TimeFrom><DayThru>Sunday</DayThru><TimeThru>23:59</TimeThru>
</Monitor>
I'd like to get the data into a table so it's easier to work with. How can I do this with Python 3.x? When I import it into Excel, it looks great.
It may be something like this:
for sla in root.findall('Objective'):
    goal = sla.find('Goal').text
    actual = sla.find('Actual').text
    compliant = sla.find('Compliant').text
    errors = sla.find('Errors').text
    checks = sla.find('Checks').text
    print('Goal:', goal, 'Actual:', actual, 'Compliant:', compliant, 'Errors:', errors, 'Checks:', checks)
But I want to load each data point into a data frame, not print each data point. How can I do the same thing using Python? TIA.
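For what it's worth, here is a rough sketch of one way the objectives could be loaded into a pandas DataFrame (assuming pandas is installed and root is the element parsed above; the tag names come from the XML shown):

import pandas as pd

rows = []
for sla in root.findall('Objective'):
    rows.append({
        'Type': sla.get('Type'),  # attribute on the Objective element
        'Goal': sla.findtext('Goal'),
        'Actual': sla.findtext('Actual'),
        'Compliant': sla.findtext('Compliant'),
        'Errors': sla.findtext('Errors'),
        'Checks': sla.findtext('Checks'),
    })

df = pd.DataFrame(rows)  # one row per Objective, one column per field
print(df)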
Should print this:
# importing csv module
import csv

# csv file name
filename = "aapl.csv"

# initializing the titles and rows list
fields = []
rows = []

# reading csv file
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        rows.append(row)

    # get total number of rows
    print("Total no. of rows: %d" % (csvreader.line_num))

# printing the field names
print('Field names are:' + ', '.join(field for field in fields))

# printing first 5 rows
print('\nFirst 5 rows are:\n')
for row in rows[:5]:
    # parsing each column of a row
    for col in row:
        print("%10s" % col, end=' ')
    print('\n')
Source: https://www.geeksforgeeks.org/working-csv-files-python/
I am crawling data from Wikipedia and it works so far. I can display it on the terminal, but I can't write it the way I need it into a CSV file :-/
The code is pretty long, but I'm pasting it here anyway and hope that somebody can help me.
import csv
import requests
from bs4 import BeautifulSoup

def spider():
    url = 'https://de.wikipedia.org/wiki/Liste_der_Gro%C3%9F-_und_Mittelst%C3%A4dte_in_Deutschland'
    code = requests.get(url).text  # Read source code and make unicode
    soup = BeautifulSoup(code, "lxml")  # create BS object
    table = soup.find(text="Rang").find_parent("table")
    for row in table.find_all("tr")[1:]:
        partial_url = row.find_all('a')[0].attrs['href']
        full_url = "https://de.wikipedia.org" + partial_url
        get_single_item_data(full_url)  # goes into the individual sites

def get_single_item_data(item_url):
    page = requests.get(item_url).text  # Read source code & format with .text to unicode
    soup = BeautifulSoup(page, "lxml")  # create BS object

    def getInfoBoxBasisDaten(s):
        return str(s) == 'Basisdaten' and s.parent.name == 'th'

    basisdaten = soup.find_all(string=getInfoBoxBasisDaten)[0]

    basisdaten_list = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
                       'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
                       'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
                       'Oberbürgermeister', 'Oberbürgermeisterin']

    with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Bundesland', 'Regierungsbezirk:', 'Höhe:', 'Fläche:', 'Einwohner:', 'Bevölkerungsdichte:',
                      'Postleitzahl', 'Vorwahl:', 'Kfz-Kennzeichen:', 'Gemeindeschlüssel:', 'Stadtgliederung:',
                      'Adresse', 'Anschrift', 'Webpräsenz:', 'Website:', 'Bürgermeister', 'Bürgermeisterin',
                      'Oberbürgermeister', 'Oberbürgermeisterin']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';', quotechar='|', quoting=csv.QUOTE_MINIMAL, extrasaction='ignore')
        writer.writeheader()

        for i in basisdaten_list:
            wanted = i
            current = basisdaten.parent.parent.nextSibling
            while True:
                if not current.name:
                    current = current.nextSibling
                    continue
                if wanted in current.text:
                    items = current.findAll('td')
                    print(BeautifulSoup.get_text(items[0]))
                    print(BeautifulSoup.get_text(items[1]))
                    writer.writerow({i: BeautifulSoup.get_text(items[1])})
                if '<th ' in str(current): break
                current = current.nextSibling

print(spider())
The output is incorrect in two ways: the cells are not in their right places, and only one city is written; all the others are missing. It looks like this:
But it should look like this, with all the other cities in it:
'... only one city is written ...': You call get_single_item_data for each city, and inside this function you open the output file with the same name in the statement with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:, which overwrites the output file each time you call the function.
Each variable is written to a new row: In the statement writer.writerow({i: BeautifulSoup.get_text(items[1])}) you write the value for one variable to a row. What you need to do instead is to make a dictionary for the values before you start looking for page values. As you find the values on the page, you put them into the dictionary by field name. Then, after you have found all of the available values, you call writer.writerow once.
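As a rough illustration of that pattern (a self-contained sketch with placeholder values and a hypothetical scrape_city helper standing in for the real scraping code): open the file and write the header once, build one dict per city, and write one row per city.

import csv

fieldnames = ['Bundesland', 'Einwohner:', 'Postleitzahl']  # shortened for the example

def scrape_city(url):
    # stands in for get_single_item_data: collect all values for one city into a dict
    return {'Bundesland': 'Beispiel', 'Einwohner:': '123456', 'Postleitzahl': '00000'}

city_urls = ['https://de.wikipedia.org/wiki/Beispielstadt']  # normally built by spider()

with open('staedte.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames, delimiter=';',
                            quotechar='|', quoting=csv.QUOTE_MINIMAL,
                            extrasaction='ignore')
    writer.writeheader()
    for url in city_urls:
        row = scrape_city(url)  # accumulate the field values for this city
        writer.writerow(row)    # one CSV row per city, all in the same open file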
from bs4 import BeautifulSoup
import urllib
import json
import os

jaren = [str("2012"), str("2010"), str("2006"), str("2003"), str("2002"), str("1998"), str("1994"), str("1989"), str("1986"), str("1982"), str("1981"), str("1977"), str("1972"), str("1971"), str("1967"), str("1963"), str("1959"), str("1956")]

DESIRED_COLUMNS = {1, 2, 5}  # scrapes only afk, aantal & zetels
verkiezingsData = []

filename = raw_input('Enter a filename: ') or 'data.json'

# open file and open json array
with open(filename, "w") as file:
    file.write("[{")

for Jaargetal in jaren:
    # url source
    r = urllib.urlopen("http://www.nlverkiezingen.com/TK" + Jaargetal + ".html").read()
    soup = BeautifulSoup(r, "html.parser")
    tables = soup.find_all("table")

    for table in tables:
        header = soup.find_all("h1")[0].getText()
        # print header
        with open(filename, "a+") as file:
            file.write("\"%s\": [" % header)  # header as beginning json

        trs = table.find_all("tr")[0].getText()
        del verkiezingsData[:]  # clear list before adding new data

        # add the 3 columns to a list
        for tr in table.find_all("tr")[1:22]:  # 22 columns top till bottom
            for index, val in enumerate(tr.find_all('td')):
                if index in DESIRED_COLUMNS:  # links to the desired columns defined above
                    verkiezingsData.append(val.getText().strip())

        # json array of the 3 values
        for a, b, c in zip(verkiezingsData[::3], verkiezingsData[1::3], verkiezingsData[2::3]):  # links to desired columns 1, 2, 5
            data2 = {'afk': a, "aantal": b, "zetels": c}
            # file writing
            with open(filename, 'a') as outfile:
                json.dump(data2, outfile)
                outfile.write(",")

        # open file, delete last comma and close array
        with open(filename, 'ab+') as file:
            file.seek(-1, os.SEEK_END)
            file.truncate()
            file.write("],")

# open file, delete last comma, and close array
with open(filename, 'r+b') as file:
    file.seek(-1, os.SEEK_END)
    file.truncate()
    file.write("}]")

# open file and pretty print json data
with open(filename, 'r') as file:
    prettydata = json.load(file)

with open(filename, 'w') as file:
    json.dump(prettydata, file, sort_keys=True, indent=4, separators=(',', ': '))
I made a scraper which scrapes from nlverkiezingen.com
It scrapes Aantal/Afk/Zetels
It has a list of years that it scrapes:
jaren = [str("2012"), str("2010"), str("2006"), str("2003"),str("2002"), str("1998"), str("1994"), str("1989")]
It needs to begin at row 1.
for tr in table.find_all("tr")[1:22]: #22 columns top till bottom
But it needs to stop when it hits a blank line/white space instead of always going to row 22 (every year ends on a different row). How can I code this?
Or is it possible to specify somewhere in the code, for every year in the list, when it needs to stop scraping rows? For example: 2010 stops at row 22, 1959 at row 10.
I will assume here that you want to break out of the for loop when the tr element is empty.
for tr in table.find_all("tr")[1:22]:
    if tr.getText() == "":
        break
If you need to stop the iteration when a specific column like 'aantal' is empty, then maybe try something like this:
# add the 3 columns to a list
for tr in table.find_all("tr")[1:22]:  # 22 columns top till bottom
    td_cols = tr.find_all('td')
    if td_cols[2].getText() == '':  # second column is aantal, if empty break loop
        break
    # else go on as usual
    for index, val in enumerate(td_cols):
        # do stuff
However, I believe you should try to structure your code differently so that you have more control over the flow.
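For example, one possible restructuring is sketched below (rough and untested, written for Python 3 with requests instead of urllib, and with the year list shortened): collect everything into a plain dictionary first and serialize it once at the end, instead of appending JSON fragments and trimming trailing commas.

import json
import requests
from bs4 import BeautifulSoup

jaren = ["2012", "2010"]     # shortened list of years, just for the sketch
DESIRED_COLUMNS = {1, 2, 5}  # afk, aantal & zetels

results = {}
for jaar in jaren:
    r = requests.get("http://www.nlverkiezingen.com/TK" + jaar + ".html")
    soup = BeautifulSoup(r.text, "html.parser")
    header = soup.find_all("h1")[0].getText()
    rows = []
    for tr in soup.find_all("table")[0].find_all("tr")[1:]:
        cols = [val.getText().strip()
                for index, val in enumerate(tr.find_all('td'))
                if index in DESIRED_COLUMNS]
        if len(cols) < 3 or cols[1] == '':  # stop at a blank or short row
            break
        rows.append({'afk': cols[0], 'aantal': cols[1], 'zetels': cols[2]})
    results[header] = rows

# serialize once at the end; no trailing-comma trimming needed
with open('data.json', 'w') as outfile:
    json.dump(results, outfile, sort_keys=True, indent=4)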
I have a CSV file with several hundred organism IDs and a second CSV file with several thousand organism IDs plus additional characteristics (taxonomic information, abundances per sample, etc.).
I am trying to write code that extracts the information from the larger CSV using the smaller CSV file as a reference. Meaning, it will look at both the smaller and larger files, and if an ID is in both files, it will extract all the information from the larger file and write it to a new file (basically write the entire row for that ID).
So far I have written the following, and while the code does not error out on me, I get a blank file in the end and I don't exactly know why. I am a graduate student who knows some simple coding, but I'm still very much a novice.
Thank you!
import sys
import csv
import os.path

SparCCnames=open(sys.argv[1],"rU")
OTU_table=open(sys.argv[2],"rU")
new_file=open(sys.argv[3],"w")

Sparcc_OTUs=csv.writer(new_file)

d=csv.DictReader(SparCCnames)
ids=csv.DictReader(OTU_table)

for record in ids:
    idstopull=record["OTUid"]
    if idstopull[0]=="OTUid":
        continue
    if idstopull[0] in d:
        new_id.writerow[idstopull[0]]

SparCCnames.close()
OTU_table.close()
new_file.close()
I'm not sure what you're trying to do in your code but you can try this:
import csv

def csv_to_dict(csv_file_path):
    csv_file = open(csv_file_path, 'rb')
    csv_file.seek(0)
    sniffdialect = csv.Sniffer().sniff(csv_file.read(10000), delimiters='\t,;')
    csv_file.seek(0)
    dict_reader = csv.DictReader(csv_file, dialect=sniffdialect)
    csv_file.seek(0)
    dict_data = []
    for record in dict_reader:
        dict_data.append(record)
    csv_file.close()
    return dict_data

def dict_to_csv(csv_file_path, dict_data):
    csv_file = open(csv_file_path, 'wb')
    writer = csv.writer(csv_file, dialect='excel')
    headers = dict_data[0].keys()
    writer.writerow(headers)
    # headers must be the same with dat.keys()
    for dat in dict_data:
        line = []
        for field in headers:
            line.append(dat[field])
        writer.writerow(line)
    csv_file.close()

if __name__ == "__main__":
    big_csv = csv_to_dict('/path/to/big_csv_file.csv')
    small_csv = csv_to_dict('/path/to/small_csv_file.csv')
    output = []
    for s in small_csv:
        for b in big_csv:
            if s['id'] == b['id']:
                output.append(b)
    if output:
        dict_to_csv('/path/to/output.csv', output)
    else:
        print "Nothing."
Hope that will help.
You need to read the data into a data structure; assuming OTUid is unique, you can store it in a dictionary for fast lookup:
with open(sys.argv[1],"rU") as SparCCnames:
    d = csv.DictReader(SparCCnames)
    fieldnames = d.fieldnames
    data = {i['OTUid']: i for i in d}

with open(sys.argv[2],"rU") as OTU_table, open(sys.argv[3],"w") as new_file:
    Sparcc_OTUs = csv.DictWriter(new_file, fieldnames)
    ids = csv.DictReader(OTU_table)
    for record in ids:
        if record['OTUid'] in data:
            Sparcc_OTUs.writerow(data[record['OTUid']])
Thank you everyone for your help. I played with things and consulted with an advisor, and finally got a working script. I am posting it in case it helps someone else in the future.
Thanks!
import sys
import csv

input_file = csv.DictReader(open(sys.argv[1], "rU"))  # has all info
ref_list = csv.DictReader(open(sys.argv[2], "rU"))  # reference list
output_file = csv.DictWriter(
    open(sys.argv[3], "w"), input_file.fieldnames)  # to write output file with headers
output_file.writeheader()  # write headers in output file

white_list = {}  # create empty dictionary
for record in ref_list:  # for every line in my reference list
    white_list[record["Sample_ID"]] = None  # store into the dictionary the IDs as keys

for record in input_file:  # for every line in my input file
    record_id = record["Sample_ID"]  # store IDs into variable record_id
    if (record_id in white_list):  # if the ID is in the reference list
        output_file.writerow(record)  # write the entire row into a new file
    else:  # if it is not in my reference list
        continue  # ignore it and continue iterating through the file