Concatenating URL pages into a single DataFrame - Python

I'm trying to download historic weather data for a given location.
I have altered an example given at flowingdata, but I'm stuck on the last step: how to concatenate multiple DataFrames.
MWE:
import pandas as pd

frames = pd.DataFrame(columns=['TimeEET', 'TemperatureC', 'Dew PointC', 'Humidity', 'Sea Level PressurehPa',
                               'VisibilityKm', 'Wind Direction', 'Wind SpeedKm/h', 'Gust SpeedKm/h', 'Precipitationmm',
                               'Events', 'Conditions', 'WindDirDegrees', 'DateUTC<br />'])

# Iterate through year, month, and day
for y in range(2006, 2007):
    for m in range(1, 13):
        for d in range(1, 32):
            # Check if leap year
            if y % 400 == 0:
                leap = True
            elif y % 100 == 0:
                leap = False
            elif y % 4 == 0:
                leap = True
            else:
                leap = False
            # Check if already gone through month
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 10] and d > 30):
                continue
            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/EFHK/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html?req_city=Vantaa&req_state=&req_statename=Finlandia&reqdb.zip=00000&reqdb.magic=4&reqdb.wmo=02974&format=1"
            df = pd.read_csv(url, sep=',', skiprows=2)
            frames = pd.concat(df)
This gives an error:
first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"
The desired output would be one DataFrame with all days, months, and years.

You should declare a list outside your loop and append each DataFrame to it; then, outside the loop, concatenate all the dfs into a single df:
import pandas as pd

frames = pd.DataFrame(columns=['TimeEET', 'TemperatureC', 'Dew PointC', 'Humidity', 'Sea Level PressurehPa',
                               'VisibilityKm', 'Wind Direction', 'Wind SpeedKm/h', 'Gust SpeedKm/h', 'Precipitationmm',
                               'Events', 'Conditions', 'WindDirDegrees', 'DateUTC<br />'])

# Iterate through year, month, and day
df_list = []
for y in range(2006, 2007):
    for m in range(1, 13):
        for d in range(1, 32):
            # Check if leap year
            if y % 400 == 0:
                leap = True
            elif y % 100 == 0:
                leap = False
            elif y % 4 == 0:
                leap = True
            else:
                leap = False
            # Check if already gone through month
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 11] and d > 30):  # months with 30 days: April, June, September, November
                continue
            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/EFHK/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html?req_city=Vantaa&req_state=&req_statename=Finlandia&reqdb.zip=00000&reqdb.magic=4&reqdb.wmo=02974&format=1"
            df = pd.read_csv(url, sep=',', skiprows=2)
            df_list.append(df)

frames = pd.concat(df_list, ignore_index=True)
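For reference, a minimal self-contained sketch of the difference (toy frames rather than the weather data): pd.concat expects an iterable of DataFrames, so passing a single frame raises exactly the error above.

import pandas as pd

df1 = pd.DataFrame({'TemperatureC': [1.0]})
df2 = pd.DataFrame({'TemperatureC': [2.0]})

# pd.concat(df1) -> TypeError: first argument must be an iterable of pandas objects ...
combined = pd.concat([df1, df2], ignore_index=True)  # pass a list (or other iterable) of frames
print(combined)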

Related

Why is my openpyxl code slower than my VBA code?

I have an Excel file of nearly 95880 rows. I made a VBA function that runs slowly, so I tried to write a Python script using openpyxl, but it's even slower.
It starts fast, then after 600 rows it becomes slower and slower.
The VBA code is:
Option Explicit

Function FTE(Assunzione As Date, Cess As Variant, Data)
    Dim myDate As Date
    Dim EndDate As Date, EndDate2 As Date
    Dim check As Integer
    EndDate = Application.WorksheetFunction.EoMonth(Assunzione, 0)
    myDate = #1/1/2022#
    If Cess = 0 Then
        Call Check2(Assunzione, Data, myDate, EndDate, check)
        FTE = check
    Else:
        EndDate2 = Application.WorksheetFunction.EoMonth(Cess, -1)
        Call Check1(Assunzione, Cess, Data, myDate, EndDate, EndDate2, check)
        FTE = check
    End If
End Function

Sub Check1(Assunzione, Cess, Data, myDate, EndDate, EndDate2, check)
    Dim Cess1 As Date
    Dim gg_lav As Integer, gg_lav2 As Integer
    Cess1 = Cess.Value
    If Assunzione > Date Then
        check = 0
    Else
        If Month(Assunzione) <= Month(Data) And Year(Assunzione) = 2022 Then
            If Assunzione > myDate Then
                gg_lav = Application.WorksheetFunction.Days(EndDate, Assunzione) + 1
                If gg_lav >= 15 Then
                    If Month(Data) = (Month(EndDate2) + 1) And Year(Cess1) = 2022 Then
                        gg_lav2 = Application.WorksheetFunction.Days(Cess1, EndDate2)
                        If gg_lav2 >= 15 Then
                            check = 1
                        Else
                            check = 0
                        End If
                    Else
                        check = 1
                    End If
                Else
                    check = 0
                End If
            Else
                check = 1
            End If
        Else
            check = 1
        End If
    End If
End Sub

Sub Check2(Assunzione, Data, myDate, EndDate, check)
    Dim gg_lav As Integer
    If Assunzione > Date Then
        check = 0
    Else
        If Month(Assunzione) <= Month(Data) And Year(Assunzione) = 2022 Then
            If Assunzione > myDate Then
                gg_lav = Application.WorksheetFunction.Days(EndDate, Assunzione) + 1
                If gg_lav >= 15 Then
                    check = 1
                Else
                    check = 0
                End If
            Else
                check = 1
            End If
        Else
            check = 1
        End If
    End If
End Sub
and my openpyxl code is:
def check1(a, d, c, i):
    if ws.cell(row=i, column=a).value > ws.cell(row=i, column=d).value:
        return 0
    else:
        if ws.cell(row=i, column=a).value.month == ws.cell(row=i, column=d).value.month and ws.cell(row=i, column=a).value.year == 2022:
            EndDate = date(ws.cell(row=i, column=a).value.year, ws.cell(row=i, column=a).value.month,
                           calendar.monthrange(ws.cell(row=i, column=a).value.year,
                                               ws.cell(row=i, column=a).value.month)[1])
            gg_lav = (EndDate - datetime.date(ws.cell(row=i, column=a).value)).days
            if gg_lav >= 15:
                EndDate2 = date(ws.cell(row=i, column=c).value.year, ws.cell(row=i, column=c).value.month - 1,
                                calendar.monthrange(ws.cell(row=i, column=c).value.year,
                                                    ws.cell(row=i, column=c).value.month - 1)[1])
                if ws.cell(row=i, column=d).value.month == EndDate2.month and ws.cell(row=i, column=c).value.year == 2022:
                    gg_lav2 = (datetime.date(ws.cell(row=i, column=c).value) - EndDate2).days
                    if gg_lav2 >= 15:
                        return 1
                    else:
                        return 0
                else:
                    return 1
            else:
                return 0
        else:
            return 1

def check2(a, d, i):
    if ws.cell(row=i, column=a).value > ws.cell(row=i, column=a).value:
        return 0
    else:
        if ws.cell(row=i, column=a).value.month == ws.cell(row=i, column=d).value.month and ws.cell(row=i, column=a).value.year == 2022:
            EndDate = date(ws.cell(row=i, column=a).value.year, ws.cell(row=i, column=a).value.month,
                           calendar.monthrange(ws.cell(row=i, column=a).value.year,
                                               ws.cell(row=i, column=a).value.month)[1])
            gg_lav = (EndDate - datetime.date(ws.cell(row=i, column=a).value)).days
            if gg_lav >= 15:
                return 1
            else:
                return 0
        else:
            return 1

wb1 = Workbook()
ws1 = wb1.create_sheet()
for i in range(2, 95882):
    if ws.cell(row=i, column=c).value == None:
        ws1.cell(row=i, column=1, value=check2(a, d, i))
    else:
        ws1.cell(row=i, column=1, value=check1(a, d, c, i))
What am I doing wrong? Should I use another library, or am I making the code needlessly memory-hungry?
Thank you very much for any help!
Update: I think the problem was with openpyxl. First I tried to reduce the number of observations from 95K to almost 5K, but it still required two and a half hours to complete the task.
So I used numpy instead, and it took 55 seconds. Yeah, that's the difference in processing speed.
Here I post the code:
with open('data.csv', 'r') as f:
    data = list(csv.reader(f, delimiter=';'))
arr = np.array(data)
arr = np.resize(arr, (4797, 13))
Of course, I had to change the code in this section:
a = 3
d = 0
c = 4

def check1(a, d, c, i):
    if int(arr[i][a]) > int(arr[i][d]):
        return 0
    else:
        za = datetime.fromordinal((int(arr[i][a]) + 693594))
        zd = datetime.fromordinal((int(arr[i][d]) + 693594))
        da = date(za.year, za.month, za.day)
        dd = date(zd.year, zd.month, zd.day)
        if za.month == zd.month and za.year + 1899 == 2022:
            EndDate = date(za.year, za.month,
                           calendar.monthrange(za.year, za.month)[1])
            gg_lav = (EndDate - da).days
            if gg_lav >= 15:
                zc = datetime.fromordinal((int(arr[i][c]) + 693594))
                dc = date(zc.year, zc.month, zc.day)
                EndDate2 = date(zc.year, zc.month - 1,
                                calendar.monthrange(zc.year, zc.month - 1)[1])
                if zd.month == EndDate2.month and zc.year == 2022:
                    gg_lav2 = (dc - EndDate2).days
                    if gg_lav2 >= 15:
                        return 1
                    else:
                        return 0
                else:
                    return 1
            else:
                return 0
        else:
            return 1
I don't report the check2 function
fte = np.array(10)
for i in range(1, 4797):
    if arr[i][c] == '':
        fte = np.append(fte, check2(a, d, i))
    else:
        fte = np.append(fte, check1(a, d, c, i))
    print(i)
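The slowdown with openpyxl is typical of calling ws.cell(...) over and over: every access re-resolves the cell, and each row here reads the same cells many times. If you want to stay with openpyxl, reading the sheet once with iter_rows into plain Python tuples, and only then applying the checks, is usually enough; a minimal sketch (the file name is illustrative):

from openpyxl import load_workbook

wb = load_workbook("data.xlsx", read_only=True)  # read_only streams the sheet instead of building full cell objects
ws = wb.active

# One pass over the sheet: every cell value is read exactly once, as plain tuples.
rows = list(ws.iter_rows(min_row=2, values_only=True))

# The check functions can then index rows[i][col] instead of calling
# ws.cell(row=i, column=col).value, e.g. rows[i - 2][a - 1] for the
# 1-based row/column numbers used above.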

Failing to fill multiple rows in a pandas DataFrame with the .iloc method

I'm having a hard time adding multiple rows of data to a dataset I have. For some reason, when I try to add values, it only sets the first row, and the rest of the rows are set to NaN.
It outputs the following as the head:
Time Open Tone
0 1449341520 377.48 0
1 1449341580 377.50 NaN
2 1449341640 377.50 NaN
3 1449341760 377.50 NaN
4 1449341820 377.50 NaN
Expected:
Time Open Tone
0 1449341520 377.48 0
1 1449341580 377.50 0
2 1449341640 377.50 0
3 1449341760 377.50 0
4 1449341820 377.50 0
btc_data.iloc[:position, -1] = to_btc_data["tone_id"] is supposed to set all the rows up to position, but instead it only sets the first row.
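One likely cause (a sketch of the behaviour, not verified against your exact data): when the right-hand side of an .iloc assignment is a Series, pandas aligns it with the target rows by index label, and to_btc_data["tone_id"] only has index 0 here because the loop in to_tone breaks after one title. A minimal illustration with toy numbers, plus one possible fix:

import pandas as pd

btc_data = pd.DataFrame({"Time": [1449341520, 1449341580, 1449341640],
                         "Open": [377.48, 377.50, 377.50]})
btc_data["Tone"] = None

tone = pd.Series([0])                 # like to_btc_data["tone_id"]: a single row with index 0
btc_data.iloc[:3, -1] = tone          # aligned by index label -> only row 0 is filled, the rest become NaN
btc_data.iloc[:3, -1] = tone.iloc[0]  # a plain scalar (or an array of matching length) fills every targeted row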
Full code:
from pygooglenews import GoogleNews
import datetime
import time
from datetime import timezone
from datetime import datetime
import json
import numpy as np
import pandas as pd

gn = GoogleNews()

def create_week_data(ts=1449341520):
    df = pd.DataFrame()
    search = gn.search('Bitcoin', from_=datetime.fromtimestamp(ts).strftime('%Y-%m-%d'), to_=datetime.fromtimestamp(ts+604800).strftime('%Y-%m-%d'))
    titles = []
    dates = []
    timestamps = []
    for item in search["entries"]:
        titles.append(item["title"])
        dates.append(item["published"])
    df["published"] = dates
    df["title"] = titles
    for item in dates:
        month = 1
        if (item[8:11] == "Jan"):
            month = 1
        elif (item[8:11] == "Feb"):
            month = 2
        elif (item[8:11] == "Mar"):
            month = 3
        elif (item[8:11] == "Apr"):
            month = 4
        elif (item[8:11] == "May"):
            month = 5
        elif (item[8:11] == "Jun"):
            month = 6
        elif (item[8:11] == "Jul"):
            month = 7
        elif (item[8:11] == "Aug"):
            month = 8
        elif (item[8:11] == "Sep"):
            month = 9
        elif (item[8:11] == "Oct"):
            month = 10
        elif (item[8:11] == "Nov"):
            month = 11
        elif (item[8:11] == "Dec"):
            month = 12
        dt = item[5:7] + "/" + str(month) + "/" + item[12:16]
        timestamps.append(int(time.mktime(datetime.strptime(dt, "%d/%m/%Y").timetuple())))
    df["timestamp"] = timestamps
    return df

from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import json

watson_api_key = "KEY"
watson_url = "URL"
authenticator = IAMAuthenticator(watson_api_key)
tone_analyzer = ToneAnalyzerV3(version='2021-11-27', authenticator=authenticator)
tone_analyzer.set_service_url(watson_url)

def to_tone(df=df):
    toneData = pd.DataFrame()
    tones = []
    timestamps = []
    for index, row in df.iterrows():
        timestamp = row["timestamp"]
        title = row["title"]
        dump = json.dumps(tone_analyzer.tone(title).get_result())[10:]
        tone_id = 0
        if (dump.find("Joy") >= 0):
            tone_id = 1
        elif (dump.find("Anger") >= 0):
            tone_id = 2
        elif (dump.find("Fear") >= 0):
            tone_id = 3
        elif (dump.find("Sadness") >= 0):
            tone_id = 4
        elif (dump.find("Analytical") >= 0):
            next
        elif (dump.find("Confident") >= 0):
            next
        elif (dump.find("Tentative") >= 0):
            next
        else:
            tone_id = 0
        tones.append(tone_id)
        timestamps.append(timestamp)
        if (len(tones) >= 1):
            break
    toneData["timestamp"] = timestamps
    toneData["tone_id"] = tones
    return toneData

print(to_tone())

import numpy as np
import pandas as pd

np.set_printoptions(precision=15, suppress=True)
btc_data = pd.read_csv(
    "/content/drive/MyDrive/Science Fair/output.csv",
    names=["Time", "Open"])
btc_data["Tone"] = None
ts = 1449341520
position = 0
while (True):
    to_btc_data = pd.DataFrame()
    to_btc_data = to_tone(df=create_week_data(ts=ts))
    for index, row in btc_data.iterrows():
        timestamp = row["Time"]
        if (timestamp >= ts):
            ts = timestamp
            position = index
            break
    btc_data.iloc[:position, -1] = to_btc_data["tone_id"]
    ts += 604800
    print(btc_data.head())
    if (ts > 1617148800):
        btc_data.iloc[ts+604800:] = to_btc_data[0]
        break

btc_data.head()
Whatever value is in the to_btc_data DataFrame is supposed to be set there. What am I doing wrong here?
Also, sorry about the terribly structured code. I am planning on organizing it during production, and I usually don't code in Python, so I won't be writing any beautiful code.

Looping through a value list for multiple URL requests in Python

I am trying to scrape Weather Underground for years' worth of hourly data from multiple weather stations and put it into a pandas DataFrame. I CANNOT use the API, as there are limits on requests and I don't want to pay thousands of dollars to scrape this data.
I can get the script to scrape all of the data I want from one station. When I try to modify it so it loops through a list of stations, I either get a 406 error or it returns only the data from the first station in my list. How can I loop through all the stations? Also, how can I store the station name so that it can be added to the DataFrame in another column?
Here is what my code looks like now:
stations = ['EGMC', 'KSAT', 'CAHR']
weather_data = []
date = []
for s in stations:
    for y in range(2014, 2015):
        for m in range(1, 13):
            for d in range(1, 32):
                # check if a leap year
                if y % 400 == 0:
                    leap = True
                elif y % 100 == 0:
                    leap = False
                elif y % 4 == 0:
                    leap = True
                else:
                    leap = False
                # check to see if dates have already been scraped
                if (m == 2 and leap and d > 29):
                    continue
                elif (y == 2013 and m == 2 and d > 28):
                    continue
                elif (m in [4, 6, 9, 11] and d > 30):
                    continue
                timestamp = str(y) + str(m) + str(d)
                print('Getting data for ' + timestamp)
                # pull URL
                url = 'http://www.wunderground.com/history/airport/{0}/' + str(y) + '/' + str(m) + '/' + str(d) + '/DailyHistory.html?HideSpecis=1'.format(stations)
                page = urlopen(url)
                # find the correct piece of data on the page
                soup = BeautifulSoup(page, 'lxml')
                for row in soup.select("table tr.no-metars"):
                    date.append(str(y) + '/' + str(m) + '/' + str(d))
                    cells = [cell.text.strip().encode('ascii', 'ignore').decode('ascii') for cell in row.find_all('td')]
                    weather_data.append(cells)

weather_datadf = pd.DataFrame(weather_data)
datedf = pd.DataFrame(date)
result = pd.concat([datedf, weather_datadf], axis=1)
result
Here is an explanation of your error: https://httpstatuses.com/406
You should add a User-Agent to your headers. But I think this site has some protection against crawling, so you may need more specialized tooling such as Scrapy, Crawlera, a proxy list, or a user-agent rotator.
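For example, a minimal sketch of sending a browser-like User-Agent with requests (the header value is just an illustrative string, and it may still not be enough if the site actively blocks crawlers):

import requests

url = "http://www.wunderground.com/history/airport/EGMC/2014/1/1/DailyHistory.html?HideSpecis=1"
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"}  # illustrative value
response = requests.get(url, headers=headers)
response.raise_for_status()  # a 406 (or any other HTTP error) would surface here
html = response.text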

Python difference between two dates

For some reason I'm really stumped with this (relatively simple) question: how to calculate the difference between two dates. I want to do this without using modules, but for some reason my code isn't outputting the correct answer.
This is my thought process, if asked to calculate the number of days between Dec 10th 2014 and Feb 2nd 2015:
1. Find the number of days left in Dec from the 10th on (31 - 10) = 21 days.
2. Find the months between Dec and Feb (i.e. Jan) and add the number of days in each = 31 days.
3. Add the days left in Dec (21) + the days in the months in between (31) + the days into the last month (2) = 54 days.
4. Then check for anomalies, i.e. leap years etc.
This is my function:
def Calculate_Date(year1, month1, day1, year2, month2, day2):
    """
    This function takes two dates (year/month/day) and returns the
    difference between the dates
    """
    # Create a dict for the # of days in each month
    month_days = {1:31, 2:28, 3:31, 4:30, 5:31, 6:30, 7:31, 8:31, 9:30, 10:31, 11:30, 12:31}
    days_left_in_month1 = month_days[month1] - day1
    days_left_in_year1 = 0
    days_into_year2 = 0
    days_between_year1_and_year2 = 0
    difference_in_days = 0
    # Find the number of days left in year one
    i = month1
    days_left_in_year = []
    while i <= 12:
        days = month_days[i]
        days_left_in_year.append(days)
        i = i + 1
    days_left_in_year1 = (sum(days_left_in_year)) - day1
    # Find the number of days into year two
    i = 1
    days_into_year = []
    while i <= month2:
        days = month_days[i]
        days_into_year.append(days)
        i = i + 1
    days_into_year2 = sum(days_into_year) - day2
    # find the difference in years
    days_between_year1_and_year2 = (year2 - year1) * 365
    # Check if it's a leap year
    leap_year = False
    while True:
        if float(year1 % 4) == 0:
            if float(year1 % 100) != 0:
                leap_year = True
                break
            if float(year1 % 100) == 0:
                if float(year1 % 400) == 0:
                    leap_year = True
                    break
        else:
            break
    # test output
    print "The number of days left in the year One are %r " % days_left_in_year1
    print "The number of days into the year Two are %r " % days_into_year2
    print "The number of days between the years are %r " % days_between_year1_and_year2
    # add an increment if leap year was true
    if leap_year == True:
        difference_in_days = days_left_in_year1 + days_into_year2 + days_between_year1_and_year2 + 1
    else:
        difference_in_days = days_left_in_year1 + days_into_year2 + days_between_year1_and_year2
    return difference_in_days

print Calculate_Date(2011,6,30,2012,06,30)
Instead of doing date2 - date1, you might find it simpler to do (date2 - x) - (date1 - x), where x is an easy-to-handle date, i.e. "Jan 0" of year1.
Let's define a couple of functions:
def days_in_month(year, month):
    """
    Return number of days in the specified month (28, 29, 30, or 31)
    """
    if month == 2:  # February
        if not (year % 400):
            return 29
        elif not (year % 100):
            return 28
        elif not (year % 4):
            return 29
        else:
            return 28
    elif month in {1, 3, 5, 7, 8, 10, 12}:
        return 31
    else:
        return 30

def days_in_year(year):
    """
    Return the number of days in the specified year (365 or 366)
    """
    return 337 + days_in_month(year, 2)

def days_this_year(year, month, day):
    """
    Return the number of days so far this year
    """
    return sum(days_in_month(year, m) for m in range(1, month)) + day

def year_days_since(base_year, this_year):
    """
    Return the number of days from the start of base_year to the start of this_year
    """
    if base_year > this_year:
        raise ValueError("base_year must be <= this_year")
    elif base_year == this_year:
        return 0
    else:
        return sum(days_in_year(y) for y in range(base_year, this_year))
then the difference between two dates becomes:
def date_diff(y1, m1, d1, y2, m2, d2):
    x = min(y1, y2)  # base date
    days1 = year_days_since(x, y1) + days_this_year(y1, m1, d1)
    days2 = year_days_since(x, y2) + days_this_year(y2, m2, d2)
    return days2 - days1
and because of the symmetry in this answer it will also happily do negative differences:
date_diff(2001, 1, 3, 2002, 2, 5) # => 398 == 365 + 31 + 2
date_diff(2002, 2, 5, 2001, 1, 3) # => -398
In case this is real code, and not a school assignment, this is the way I'd do it:
from datetime import date

def date_diff(y1, m1, d1, y2, m2, d2):
    return (date(y2, m2, d2) - date(y1, m1, d1)).days
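For example, with the dates from the question's final call:

print(date_diff(2011, 6, 30, 2012, 6, 30))  # 366, since the span includes 29 Feb 2012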

Having trouble using BeautifulSoup to parse WeatherUnderground

I'm trying to adapt some code to extract information from Weather Underground. However, the script I'm trying to adapt was written in 2008 and the formatting on Weather Underground has changed since. I'm having trouble with soup.body.nobr.b.string. I want to extract daily precipitation data for a given site: http://www.wunderground.com/history/airport/KBUF/2011/5/2/DailyHistory.html
import urllib2
from BeautifulSoup import BeautifulSoup

# Create/open a file called wunder.txt (which will be a comma-delimited file)
f = open('wunder-data.txt', 'w')

# Iterate through year, month, and day
for y in range(1980, 2007):
    for m in range(1, 13):
        for d in range(1, 32):
            # Check if leap year
            if y % 400 == 0:
                leap = True
            elif y % 100 == 0:
                leap = False
            elif y % 4 == 0:
                leap = True
            else:
                leap = False
            # Check if already gone through month
            if (m == 2 and leap and d > 29):
                continue
            elif (m == 2 and d > 28):
                continue
            elif (m in [4, 6, 9, 10] and d > 30):
                continue
            # Open wunderground.com url
            url = "http://www.wunderground.com/history/airport/KBUF/" + str(y) + "/" + str(m) + "/" + str(d) + "/DailyHistory.html"
            page = urllib2.urlopen(url)
            # Get temperature from page
            soup = BeautifulSoup(page)
            dayTemp = soup.body.nobr.b.string
            # Format month for timestamp
            if len(str(m)) < 2:
                mStamp = '0' + str(m)
            else:
                mStamp = str(m)
            # Format day for timestamp
            if len(str(d)) < 2:
                dStamp = '0' + str(d)
            else:
                dStamp = str(d)
            # Build timestamp
            timestamp = str(y) + mStamp + dStamp
            # Write timestamp and temperature to file
            f.write(timestamp + ',' + dayTemp + '\n')

# Done getting data! Close file.
f.close()
Don't mess with parsing the HTML; it'll likely change again without notice.
Get one of their CSV files (there are links at the bottom of the HTML pages), and parse it with the csv module.
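A minimal sketch of that approach, assuming the format=1 query parameter (used in the first question above) still returns the comma-delimited version of the page; written for Python 2 to match the question's code:

import csv
import urllib2

url = ("http://www.wunderground.com/history/airport/KBUF/2011/5/2/"
       "DailyHistory.html?format=1")
reader = csv.reader(urllib2.urlopen(url))
rows = [row for row in reader if row]  # skip any blank lines in the feed
# rows[0] is the header row; look up the precipitation column by name rather than by position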
