Having trouble using BeautifulSoup to parse WeatherUnderground - python

I'm trying to adapt code to extract information from wunderground. However, the script I'm adapting was written in 2008 and the formatting on Weather Underground has changed, so I'm having trouble with soup.body.nobr.b.string. I want to extract daily precipitation data for a given site: http://www.wunderground.com/history/airport/KBUF/2011/5/2/DailyHistory.html
import urllib2
from BeautifulSoup import BeautifulSoup

# Create/open a file called wunder-data.txt (which will be a comma-delimited file)
f = open('wunder-data.txt', 'w')

# Iterate through year, month, and day
for y in range(1980, 2007):
    for m in range(1, 13):
        for d in range(1, 32):
            # Check if leap year
            if y % 400 == 0:
                leap = True
            elif y % 100 == 0:
                leap = False
            elif y % 4 == 0:
                leap = True
            else:
                leap = False
            # Check if already gone through month
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 11] and d > 30):  # 30-day months
                continue
            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/KBUF/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html"
            page = urllib2.urlopen(url)
            # Get temperature from page
            soup = BeautifulSoup(page)
            dayTemp = soup.body.nobr.b.string
            # Format month for timestamp
            if len(str(m)) < 2:
                mStamp = '0' + str(m)
            else:
                mStamp = str(m)
            # Format day for timestamp
            if len(str(d)) < 2:
                dStamp = '0' + str(d)
            else:
                dStamp = str(d)
            # Build timestamp
            timestamp = str(y) + mStamp + dStamp
            # Write timestamp and temperature to file
            f.write(timestamp + ',' + dayTemp + '\n')

# Done getting data! Close file.
f.close()

Don't mess with parsing the HTML; it'll likely change again without notice.
Get one of their CSV files (there are links at the bottom of the HTML pages), and parse it with the csv module.
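A minimal sketch of that approach, assuming the comma-delimited link at the bottom of the page is the same daily-history URL with format=1 appended (as in the EFHK snippet further down this page); the exact parameter and column names should be checked against the actual link and header row:

import csv
import urllib2

url = ("http://www.wunderground.com/history/airport/KBUF/2011/5/2/"
       "DailyHistory.html?format=1")
raw = urllib2.urlopen(url).read()
# The format=1 response carries a leading blank line and a trailing "<br />"
# on each row (note the 'DateUTC<br />' column elsewhere on this page), so
# clean the lines before handing them to csv.DictReader.
lines = [line.replace('<br />', '') for line in raw.splitlines() if line.strip()]
for row in csv.DictReader(lines):
    print row  # each row is a dict keyed by the header line

Each row comes back keyed by the CSV header, so the daily precipitation value can be picked out by its column name instead of navigating the HTML.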

Related

PDF splitting with Bookmarks in python through PyPDF4 - bookmarks are lost in the output

I am trying to create a script to split PDF pages at the given page numbers/labels. The script produces the split PDFs correctly, but some information is lost and needs to be corrected:
bookmarks are lost in the separated PDFs if the original PDF contains bookmarks;
if the PDF contains page labels with both Roman and Arabic page numbers, e.g. a prelims part starting with i, ii, iii, iv ... and a main-matter part starting again with Arabic numbers 1, 2, 3 ..., then passing an Arabic page number for the split actually splits the prelims part (Roman page numbers): requesting (start = 5, end = 10) splits from (start = v, end = x).
How can I correct these issues in the script below? (A minimal bookmark-copying sketch follows the page-information listing at the end of this question.)
MWE
import re
import regex
import sys
import os
from iPython.ErrorLog import *
from iPython.LaTeX_QC_validation import *
#from pdfrw import PdfReader, PdfWriter
from PyPDF4 import PdfFileWriter, PdfFileReader
from pdfrw import PdfReader, PdfWriter
from pagelabels import PageLabels, PageLabelScheme

# Read the page information file and merge each StartPage/EndPage pair into one tag
pg_info = open('pageinfo.txt', 'r')
pgcnt = pg_info.read()
pg_info.close()
print(pgcnt)
pgcnt = re.sub(r'<Misc="([0-9]+)" StartPage="([^">].*)">\s*<Misc="(?:[0-9]+)" EndPage="([^">].*)"/>',
               r'<Misc="\1" StartPage="\2" EndPage="\3"/>',
               pgcnt, flags=re.I | re.S | re.M)  # flags must be passed by keyword; the 4th positional argument of re.sub is count
print(pgcnt)
pno = []

def value(rno):
    # Value of a single Roman numeral symbol
    r = rno.upper()
    if (r == 'I'):
        return 1
    if (r == 'V'):
        return 5
    if (r == 'X'):
        return 10
    if (r == 'L'):
        return 50
    if (r == 'C'):
        return 100
    if (r == 'D'):
        return 500
    if (r == 'M'):
        return 1000
    return -1

def romanToDecimal(s):
    res = 0
    i = 0
    while (i < len(s)):
        # Getting value of symbol s[i]
        s1 = value(s[i])
        if (i + 1 < len(s)):
            # Getting value of symbol s[i + 1]
            s2 = value(s[i + 1])
            if (s1 >= s2):
                # Value of current symbol is greater than or equal to the next symbol
                res = res + s1
                i = i + 1
            else:
                # Value of current symbol is smaller than the next symbol
                res = res + s2 - s1
                i = i + 2
        else:
            res = res + s1
            i = i + 1
    return res

def get_pageInfo(pginfo):
    # Collect (start, end) page pairs, converting Roman numerals to integers where needed
    global pno
    for m in re.finditer(r'<Misc="([0-9]+)" StartPage="([^">].*)" EndPage="([^">].*)"/>', pginfo, re.I):
        Start_page = m.group(2)
        End_page = m.group(3)
        x = Start_page
        y = End_page
        if not x.isnumeric():
            Start_page = romanToDecimal(Start_page)
        else:
            Start_page = int(Start_page)
        if not y.isnumeric():
            End_page = romanToDecimal(End_page)
        else:
            End_page = int(End_page)
        print(x, Start_page, y, End_page)
        pno.append((Start_page, End_page))
    return pno

pgdetails = get_pageInfo(pgcnt)
print(pgdetails)

def pdf_splitter(file, start, end, fcount):
    fix_start = start
    # The split part is saved as "<fcount>.pdf"
    new_file_name = str(fcount) + ".pdf"
    read_file = PdfFileReader(open(file, "rb"))  # read pdf
    new_pdf = PdfFileWriter()  # create write object
    start -= 1
    try:
        with open(new_file_name, "wb") as f:
            for i in range(start, end):
                new_pdf.addPage(read_file.getPage(i))
            new_pdf.write(f)
        print("PDF split successfully")
        # Re-label the pages of the new file so numbering starts at fix_start
        reader = PdfReader(new_file_name)
        labels = PageLabels.from_pdf(reader)
        newlabel = PageLabelScheme(startpage=0,             # index of the page where the labels will start
                                   style="roman lowercase",  # see options in PageLabelScheme.styles()
                                   prefix="",
                                   firstpagenum=fix_start)   # number to attribute to the first page of this index
        labels.append(newlabel)  # add our page labels to the existing ones
        labels.write(reader)
        writer = PdfWriter()
        writer.trailer = reader
        writer.write(new_file_name)
    except Exception as e:
        print(e)

x = 0
for i in pgdetails:
    x += 1
    Start, End = i
    pdf_splitter('input.pdf', Start, End, x)
sys.exit()
And the page information file (pageinfo.txt) contains the information below:
<Misc="1" StartPage="i">
<Misc="1" EndPage="ii"/>
<Misc="2" StartPage="ii">
<Misc="2" EndPage="ii"/>
<Misc="3" StartPage="iv">
<Misc="3" EndPage="iv"/>
<Misc="4" StartPage="v">
<Misc="4" EndPage="vi"/>
<Misc="5" StartPage="vii">
<Misc="5" EndPage="xiv"/>
<Misc="6" StartPage="xv">
<Misc="6" EndPage="xv"/>
<Misc="7" StartPage="xvi">
<Misc="7" EndPage="xviii"/>
<Misc="8" StartPage="xix">
<Misc="8" EndPage="xx"/>
Thanks in Advance
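One possible direction for the bookmark problem, shown only as a sketch: re-add the bookmarks that fall inside each split range. This assumes PyPDF4 keeps the getOutlines(), getDestinationPageNumber() and addBookmark() methods of PyPDF2 (from which it was forked); adjust the names if your version differs.

def copy_bookmarks(read_file, new_pdf, start, end):
    # Re-add bookmarks whose target page falls inside [start, end),
    # shifting their page numbers into the shorter output document.
    # Nested outlines are flattened to top level to keep the sketch short.
    def walk(outlines):
        for item in outlines:
            if isinstance(item, list):   # a nested list of child bookmarks
                walk(item)
            else:
                page = read_file.getDestinationPageNumber(item)
                if start <= page < end:
                    new_pdf.addBookmark(item.title, page - start)
    walk(read_file.getOutlines())

Called right after the addPage loop in pdf_splitter (before new_pdf.write(f)), this would at least preserve the bookmarks that point into the extracted range. The Roman/Arabic mix-up is a separate issue: the split indices have to refer to absolute page positions in the file, not to the label values.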

looping through value list for multiple url requests in python

I am trying to scrape Weather Underground for years' worth of hourly data from multiple weather stations and put it into a pandas dataframe. I CANNOT use the API as there are limits on requests and I don't want to pay thousands of dollars to scrape this data.
I can get the script to scrape all of the data I want from one station. When I try to modify it so it loops through a list of stations, I either get a 406 error or it returns only the data from the first station in my list. How can I loop through all the stations? Also, how can I store the station name so that it can be added to the dataframe in another column?
Here is what my code looks like now:
from urllib.request import urlopen   # urllib2.urlopen on Python 2
from bs4 import BeautifulSoup
import pandas as pd

stations = ['EGMC', 'KSAT', 'CAHR']
weather_data = []
date = []
for s in stations:
    for y in range(2014, 2015):
        for m in range(1, 13):
            for d in range(1, 32):
                # check if a leap year
                if y % 400 == 0:
                    leap = True
                elif y % 100 == 0:
                    leap = False
                elif y % 4 == 0:
                    leap = True
                else:
                    leap = False
                # check to see if dates have already been scraped
                if (m == 2 and leap and d > 29):
                    continue
                elif (y == 2013 and m == 2 and d > 28):
                    continue
                elif (m in [4, 6, 9, 11] and d > 30):
                    continue
                timestamp = str(y) + str(m) + str(d)
                print('Getting data for ' + timestamp)
                # pull URL (note: .format() here applies only to the last string literal, not the whole URL)
                url = 'http://www.wunderground.com/history/airport/{0}/' + str(y) + '/' + str(m) + '/' + str(d) + '/DailyHistory.html?HideSpecis=1'.format(stations)
                page = urlopen(url)
                # find the correct piece of data on the page
                soup = BeautifulSoup(page, 'lxml')
                for row in soup.select("table tr.no-metars"):
                    date.append(str(y) + '/' + str(m) + '/' + str(d))
                    cells = [cell.text.strip().encode('ascii', 'ignore').decode('ascii') for cell in row.find_all('td')]
                    weather_data.append(cells)

weather_datadf = pd.DataFrame(weather_data)
datedf = pd.DataFrame(date)
result = pd.concat([datedf, weather_datadf], axis=1)
result
Here is an explanation of your error: https://httpstatuses.com/406
You should add a User-Agent to the request headers. But I think this site has some protection against crawling, so you may need more specialised tools such as Scrapy, Crawlera, a proxy list, or a user-agent rotator.
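A minimal sketch of the User-Agent suggestion, using the requests library; the header string is just an example and the station/date in the URL are placeholders:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'}
url = 'http://www.wunderground.com/history/airport/EGMC/2014/1/1/DailyHistory.html?HideSpecis=1'
page = requests.get(url, headers=headers)   # the server sees a browser-like agent instead of the library default
soup = BeautifulSoup(page.text, 'lxml')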

Processing big files using requests library and DictReader - Python

So I'm trying to process huge data files (> 1.6 GB) using the requests library to read them in chunks.
import urllib2, json, csv
import requests
import multiprocessing
import itertools                 # used to flatten the pooled results
from datetime import datetime    # used for the date handling below

def getTaxiTrips(date):
    """
    Gets the taxi trips that occurred in NY from a starting date.
    :param date: (Y-m-d).
    :return: list of tuples (long, lat, drop off date).
    """
    today = str(datetime.date(datetime.now())).split('-')
    today_y = today[0]
    today_m = today[1]
    start = date.split('-')
    start_y = start[0]
    start_m = start[1]
    print start_m + "-" + start_y + " / " + today_m + "-" + today_y
    data = []
    y = int(start_y)
    m = int(start_m)
    while int(start_y) <= int(today_y):
        # Month transformation
        if m > 12:
            m %= 12
            y += 1
        mt = str(m) if m > 9 else '0' + str(m)
        # Green cabs
        if readCSV("https://storage.googleapis.com/tlc-trip-data/" + str(y) +
                   "/green_tripdata_" + str(y) + "-" + mt + ".csv") is not None:
            data.append("https://storage.googleapis.com/tlc-trip-data/" + str(y) +
                        "/green_tripdata_" + str(y) + "-" + mt + ".csv")
        if m == int(today_m):
            break
        m += 1
    pool = multiprocessing.Pool(mps - 1)
    result = pool.map(consumeTaxiData, data)
    pool.close()
    pool.join()
    return list(itertools.chain(*result))

def consumeTaxiData(url):
    """
    Given a url, reads its content and processes its data.
    :param url: the url to be read.
    :return: a list of tuples in the form (long, lat, hour).
    """
    print "Processing", url
    points = []
    r = requests.get(url, stream=True)
    for chunk in r.iter_content(chunk_size=1024):
        if chunk:
            reader = csv.DictReader(chunk.splitlines(), delimiter=',')
            for line in reader:
                print line
                latitude = line.get('dropoff_latitude', None)
                if latitude is None:
                    latitude = line.get('Dropoff_latitude', None)
                longitude = line.get('dropoff_longitude', None)
                if longitude is None:
                    longitude = line.get('Dropoff_longitude', None)
                time = line.get('tpep_dropoff_datetime', None)
                if time is None:
                    time = line.get('Lpep_dropoff_datetime', None)
                if time is not None and latitude is not None and longitude is not None and \
                        datetime.strptime(time, '%Y-%m-%d %H:%M:%S') >= datetime.strptime(date, '%Y-%m-%d'):
                    time = roundTime(datetime.strptime(time, '%Y-%m-%d %H:%M:%S'), roundTo=60 * 60).hour
                    points.append((longitude, latitude, time))
    return points
This is one file example:
https://storage.googleapis.com/tlc-trip-data/2015/green_tripdata_2015-06.csv
I'm not sure how to use this idea properly, because the first line of the file is a header that gives the keys of the content I need to capture. The header may differ between files, so I thought about reading it with csv.DictReader. But I don't know if this works across chunks, because the header would only be captured in the first chunk, right? Is there a way to preserve the keys and still be able to use csv.DictReader?
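One possible direction, shown only as a sketch: stream the response line by line with iter_lines() instead of fixed-size chunks, so csv.DictReader reads the header once and keeps those keys for every following row (DictReader accepts any iterator of lines):

import csv
import requests

def stream_csv_rows(url):
    r = requests.get(url, stream=True)
    r.encoding = 'utf-8'  # assumption about the file encoding
    # iter_lines yields one decoded line at a time; empty keep-alive lines are skipped
    lines = (line for line in r.iter_lines(decode_unicode=True) if line)
    for row in csv.DictReader(lines):
        yield row  # each row is a dict keyed by the header of the whole file

Each yielded row is a dict, so the dropoff_latitude / dropoff_longitude lookups from the snippet above would work unchanged, without ever seeing a header-less chunk.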

Concatenating url pages as a single Data Frame

I'm trying to download historic weather data for a given location.
I have altered an example given at flowingdata, but I'm stuck on the last step: how to concatenate multiple DataFrames.
MWE:
import pandas as pd

frames = pd.DataFrame(columns=['TimeEET', 'TemperatureC', 'Dew PointC', 'Humidity', 'Sea Level PressurehPa',
                               'VisibilityKm', 'Wind Direction', 'Wind SpeedKm/h', 'Gust SpeedKm/h', 'Precipitationmm',
                               'Events', 'Conditions', 'WindDirDegrees', 'DateUTC<br />'])
# Iterate through year, month, and day
for y in range(2006, 2007):
    for m in range(1, 13):
        for d in range(1, 32):
            # Check if leap year
            if y % 400 == 0:
                leap = True
            elif y % 100 == 0:
                leap = False
            elif y % 4 == 0:
                leap = True
            else:
                leap = False
            # Check if already gone through month
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 11] and d > 30):  # 30-day months
                continue
            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/EFHK/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html?req_city=Vantaa&req_state=&req_statename=Finlandia&reqdb.zip=00000&reqdb.magic=4&reqdb.wmo=02974&format=1"
            df = pd.read_csv(url, sep=',', skiprows=2)
            frames = pd.concat(df)
This gives an error:
first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
The desired output is a single DataFrame covering all days, months and years.
You should declare a list outside your loops and append to it; then, after the loops, concatenate all the DataFrames into a single one:
import pandas as pd

# Collect each day's DataFrame in a list, then concatenate once after the loops
df_list = []
# Iterate through year, month, and day
for y in range(2006, 2007):
    for m in range(1, 13):
        for d in range(1, 32):
            # Check if leap year
            if y % 400 == 0:
                leap = True
            elif y % 100 == 0:
                leap = False
            elif y % 4 == 0:
                leap = True
            else:
                leap = False
            # Check if already gone through month
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 11] and d > 30):  # 30-day months
                continue
            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/EFHK/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html?req_city=Vantaa&req_state=&req_statename=Finlandia&reqdb.zip=00000&reqdb.magic=4&reqdb.wmo=02974&format=1"
            df = pd.read_csv(url, sep=',', skiprows=2)
            df_list.append(df)

frames = pd.concat(df_list, ignore_index=True)

Parse the HTML Table

I have an HTML table that I need to parse into a CSV file.
import urllib2, datetime
from BeautifulSoup import BeautifulSoup

olddate = datetime.datetime.strptime('5/01/13', "%m/%d/%y")
print("dates,location,name,url")

def genqry(arga, argb, argc, argd):
    return arga + "," + argb + "," + argc + "," + argd

part = 1
row = 1
contenturl = "http://www.robotevents.com/robot-competitions/vex-robotics-competition"
soup = BeautifulSoup(urllib2.urlopen(contenturl).read())
table = soup.find('table', attrs={'class': 'catalog-listing'})
rows = table.findAll('tr')
for tr in rows:
    try:
        if row != 1:
            cols = tr.findAll('td')
            for td in cols:
                if part == 1:
                    keep = 0
                    dates = td.find(text=True)
                    part = 2
                if part == 2:
                    location = td.find(text=True)
                    part = 2
                if part == 3:
                    name = td.find(text=True)
                    for a in tr.findAll('a', href=True):
                        url = a['href']
                    # Compare Dates
                    if len(dates) < 6:
                        newdate = datetime.datetime.strptime(dates, "%m/%d/%y")
                        if newdate > olddate:
                            keep = 1
                        else:
                            keep = 0
                    else:
                        newdate = datetime.datetime.strptime(dates[:6], "%m/%d/%y")
                        if newdate > olddate:
                            keep = 1
                        else:
                            keep = 0
                    if keep == 1:
                        qry = genqry(dates, location, name, url)
                        print(qry)
                    row = row + 1
                    part = 1
        else:
            row = row + 1
    except (RuntimeError, TypeError, NameError):
        print("Error: " + name)
I need to be able to get every VEX event in that table that is after 5/01/13. So far, this code gives me an error about the dates that I can't seem to fix. Maybe someone who is better at this than me can fix this code? Thanks in advance, Smith.
EDIT #1: The error that I am getting is:
ValueError: '\n10/5/13' does not match format '%m/%d/%y'
I think that I need to remove the newlines at the beginning of the string first.
EDIT #2: Got it to run without any output. Any help?
Your question is very poor. Without knowing the exact error, I would guess the problem is with your if len(dates) < 6: block. Consider the following:
>>> date = '10/5/13 - 12/14/13'
>>> len(date)
18
>>> date = '11/9/13'
>>> len(date)
7
>>> date[:6]
'11/9/1'
One suggestion to make your code more Pythonic: Instead of doing row = row + 1, use enumerate.
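For example, a sketch of the enumerate version of the outer loop (names taken from your snippet):

for row_idx, tr in enumerate(rows):
    if row_idx == 0:      # skip the header row instead of tracking `row` manually
        continue
    cols = tr.findAll('td')
    # ... process cols as before ...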
Update: Tracing your code, I get the value of dates as follows:
>>> dates
u'\n10/5/13 - 12/14/13 \xa0\n '
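Given that traced value, a small sketch of cleaning the cell text before parsing it (the \xa0 is a non-breaking space):

import datetime

raw = u'\n10/5/13 - 12/14/13 \xa0\n '
first = raw.replace(u'\xa0', ' ').strip().split(' - ')[0]   # '10/5/13'
newdate = datetime.datetime.strptime(first, "%m/%d/%y")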
