How can I plot the monthly average temperature? - python

I wrote the code below to obtain a series of monthly weather observations for Helsinki from 1960 to 2020 and then save the data to a local file using the pickle module. I used the data available from the API provided by the Finnish Meteorological Institute.
import datetime
from pprint import pprint
import pickle
import pandas as pd
class FmiApi:
    """
    a minimalistic wrapper for the Finnish Meteorological Institute API
    """
    def __init__(self):
        self.base_url = 'http://opendata.fmi.fi/wfs'
        self.fixed_params = {
            'service': 'WFS',
            'version': '2.0.0',
        }
        self.namespaces = {
            'wfs': 'http://www.opengis.net/wfs/2.0',
            'gml': 'http://www.opengis.net/gml/3.2',
            'BsWfs': 'http://xml.fmi.fi/schema/wfs/2.0'
        }

try:
    with open('wdata.pkl', 'rb') as f:
        data = pickle.load(f)
except:
    wheaters = []
    for year in range(1960, 2021):
        for month in range(1, 13):
            api = FmiApi()
            w = api.get_monthly_obs('helsinki', year, month)
            wheaters.append(w)
            pprint(w)
    with open('wdata.pkl', 'wb') as f:
        pickle.dump(wheaters, f)
Now I want to use the local file to access the data in order to plot the monthly average temperature for the years 1960 to 2020.
I wrote this code but it doesn't print the average temperature:
def get_data(file_in=None, file_out=None):
    assert file_in != None or file_out != None

    #read wdata from local file
    df_weath = pd.read_csv('wdata.pkl', parse_dates=[0], infer_datetime_format=True)
    df_weath.sort_values('Date', inplace=True, ignore_index=True)
    df_weath['Date'] = df_weath['Date'].dt.date  #convert to datetime objects
    #print(df_wdata)
    #input()

    #get weather data in the format returned by the api
    if file_in != None:  #read weather data from local file
        with open(file_in, 'rb') as f:
            wdata = pickle.load(f)
    else:  #use the FMI API to get weather data
        api = FmiApi()  #instantiate the api
        params = {
            'place': u'helsinki',
            'maxlocations': '5',
        }
        d0 = df_weath['date'].values[0]
        d1 = df_weath['date'].values[-1]
        n_days = (d1 - d0).days + 1  #number of days between d0 and d1
        wdata = []
        for i_day in range(n_days):
            date = d0 + datetime.timedelta(days=i_day)
            params['starttime'] = str(date) + 'T00:00:00'
            params['endtime'] = str(date) + 'T00:00:00'
            try:
                print('requesting weather data for %s' % str(date))
                weather = api.get_daily_obs(params)
            except:
                print('getting weather failed, skipping')
                continue
            wdata.append(weather)
        if file_out != None:  #save weather data to a file
            with open(file_out, 'wb') as f:
                pickle.dump(wdata, f)

    #move weather data to a pandas dataframe (calculate avg over valid observations)
    data = []
    variables = ['tday']
    for wobs in wdata:
        avgs = {}
        for pos, obs in wobs.items():
            for var, xx in obs.items():
                if not var in variables: continue
                sdate = xx[1][:10]
                date = datetime.date(*[int(z) for z in sdate.split('-')])
                if xx[0] != 'NaN':
                    val = float(xx[0])
                else:
                    val = None
                if not var in avgs: avgs[var] = []
                if val != None: avgs[var].append(val)
        vals = []
        for var in variables:  #calculate the average when available
            if len(avgs[var]) > 0:
                vals.append(sum(avgs[var]) / len(avgs[var]))
            else:
                vals.append(None)
        row = [date] + vals
        data.append(row)
    df_weather = pd.DataFrame(data, columns=['date'] + variables)
    print(df_weather)
    input()

    #merge the dataframes by date (pick dates from weather df and add NaNs for missing values)
    df = pd.merge(df_weather, df_weath, on='date', how='left', sort=True)
    return df
Each element of the saved list is a dictionary like this:
{(65.90051, 29.06302): {'rrmon': ('18.4', '1985-01-01T00:00:00Z'),
'tmon': ('NaN', '1985-01-01T00:00:00Z')},
(65.91718, 29.22969): {'rrmon': ('15.0', '1985-01-01T00:00:00Z'),
'tmon': ('NaN', '1985-01-01T00:00:00Z')},
(65.95515, 29.1347): {'rrmon': ('16.9', '1985-01-01T00:00:00Z'),
'tmon': ('NaN', '1985-01-01T00:00:00Z')}}
Can you help me find what I am doing wrong? Or do you know an easier way to plot the monthly average temperature?
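For reference, here is a minimal sketch of one way to produce the plot straight from the local file. It is hedged, not a drop-in fix: it assumes wdata.pkl holds the list of per-month dictionaries shown above, with 'tmon' being the monthly mean temperature, and it uses pickle.load because pd.read_csv cannot parse a pickle file (pd.read_pickle would also work):

import pickle
import pandas as pd
import matplotlib.pyplot as plt

# Load the list of monthly observation dicts exactly as it was written.
with open('wdata.pkl', 'rb') as f:
    wdata = pickle.load(f)

rows = []
for wobs in wdata:                      # one dict per requested month
    temps, stamp = [], None
    for pos, obs in wobs.items():       # one entry per station position
        value, stamp = obs['tmon']      # ('NaN' or a number as text, ISO timestamp)
        if value != 'NaN':
            temps.append(float(value))
    if stamp is not None and temps:     # skip months where every station reported NaN
        rows.append({'date': pd.to_datetime(stamp[:10]), 'tmon': sum(temps) / len(temps)})

df = pd.DataFrame(rows).sort_values('date')
ax = df.plot(x='date', y='tmon', figsize=(12, 4), legend=False)
ax.set_ylabel('monthly mean temperature (°C)')
plt.show()

Note also that get_data sorts df_weath on 'Date' but later reads df_weath['date']; whichever header the file really has, the name needs to be used consistently.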

Related

How to wrap text in excel using Python?

I created a script which parses YAML files to Excel. I've already made the part of the code that adjusts the column widths, but now I want to wrap the text in the Description column.
(Screenshots of the output after script execution and of the desired wrapped output omitted.)
Here is my code:
import yaml
import os
import pandas as pd
from openpyxl.utils import get_column_letter
import sys
def yaml_to_excel(dictionary, row):
    for key, value in dictionary.items():
        if type(value) is dict:
            yield from yaml_to_excel(value, row + 1)
        elif type(value) is list:
            for item in value:
                yield from yaml_to_excel(item, row + 1)
                row += 1
        else:
            yield (key, value, row)

def get_all_subdirectiries_in_path(root_dir):
    all_subdirectories = []
    for file in os.listdir(root_dir):
        d = os.path.join(root_dir, file)
        if os.path.isdir(d):
            all_subdirectories.append(d)
    return all_subdirectories

def dfs_tabs(df_list, sheet_list, file_name, columns):
    writer = pd.ExcelWriter(file_name, engine='openpyxl')
    for dataframe, sheet, columns_list in zip(df_list, sheet_list, columns):
        dataframe.to_excel(writer, sheet_name=sheet, startrow=0, startcol=0, index=False, columns=columns_list)
        for column in dataframe:
            column_width = max(dataframe[column].astype(str).map(len).max() + 5, len(column) + 5)
            col_letter = get_column_letter(dataframe.columns.get_loc(column) + 1)
            writer.sheets[sheet].column_dimensions[col_letter].width = column_width
    writer.save()

def get_nested_dir_path(main_dir):
    if any(File.endswith(".yml") for File in os.listdir(main_dir)):
        yield main_dir
    else:
        for file in os.listdir(main_dir):
            yield from get_nested_dir_path(main_dir + f"/{file}")

def main(dir_path):
    all_sheets = []
    sheet_names = []
    all_columns = []
    for subdirectory in get_nested_dir_path(dir_path):
        final_object = []
        last_row = 1
        current_row = 1
        for file in os.listdir(subdirectory):
            # Check if YAML file exist
            if (file.split(".")[1] == "yml") | (file.split(".")[1] == "yaml"):
                with open(subdirectory + f"/{file}", "r", encoding="utf-8") as f:
                    dataMap = yaml.safe_load(f)
                last_row += 1
                hierarchy_obj = {}
                hierarchy = 1
                prev_value = ""
                prev_temp_depth = 2
                current_row += 1
                starting_row = current_row
                for key, value, temp_depth in yaml_to_excel(dataMap, last_row):
                    if key not in hierarchy_obj:
                        hierarchy_obj[key] = hierarchy
                        hierarchy += 1
                    if prev_value != "":
                        if hierarchy_obj[key] < hierarchy_obj[prev_value]:
                            current_row += 1
                        if (temp_depth > prev_temp_depth) & (
                            hierarchy_obj[key] > hierarchy_obj[prev_value]
                        ):
                            current_row += 1
                    if type(value) is bool:
                        if value:
                            value = 'yes'
                        else:
                            value = 'no'
                    final_object.append(
                        {
                            "column": hierarchy_obj[key],
                            "value": value,
                            "row": current_row,
                        }
                    )
                    prev_value = key
                    prev_temp_depth = temp_depth
        columns_arr = []
        for column_name in hierarchy_obj.keys():
            columns_arr.append(column_name)
        all_columns.append(columns_arr)
        last_row_in_excel = final_object[len(final_object) - 1]['row']
        main_arr = []
        for i in range(last_row_in_excel):
            main_arr.append(["" for i in range(len(columns_arr))])
        main_arr = [main_arr]
        for item in final_object:
            if (type(item['value']) == int) | (type(item['value']) == float):
                main_arr[0][item['row'] - 1][item['column'] - 1] = item['value']
            else:
                main_arr[0][item['row'] - 1][item['column'] - 1] = item['value']
        result = pd.DataFrame(main_arr[0], columns=columns_arr)
        result = result[1:]
        all_sheets.append(result)
        sheet_names.append(subdirectory[subdirectory.rfind('/') + 1:].replace("/", " ").replace("_", " ")[0:31])
    dfs_tabs(all_sheets, sheet_names, 'test.xlsx', all_columns)

if __name__ == "__main__":
    dir_path = 'Lands'
    main(dir_path)
And below you can find the YAML file which I based it on:
Country: France
Capital City: Paris
Language: French
Description: |-
  A country whose metropolitan part is in Western Europe, and also has overseas territories on other continents.
The most important people:
  President: Emmanuel Macron
  Prime Minister: Élisabeth Borne
  Year: 2020
Population:
  - Total population: 67 522 000
    Year: 2021
Nationality:
  - French population: 87%
    German population: 2%
    Year: 2022
Paris neighborhoods:
  - 1st district of Paris: Louvre
    Paris II district: Bourse
    Year: 2023
Additional information:
To run my script you must create folder Lands/France/france.yml
I will be grateful for any tips on how to wrap the text in the Description column!
In this particular situation I suggest parsing the YAML file with pandas.json_normalize and using the format class of XlsxWriter to wrap the text in the Description column.
Try this:
import pandas as pd
from yaml import safe_load

with open(r"Lands\France\france.yaml", 'r', encoding='utf-8') as f:
    df = pd.json_normalize(safe_load(f))

# --- Selecting columns
description_index = df.columns.get_loc("Description")
df = df.iloc[:, 0:description_index+1]  # keep the assignment, otherwise the selection is discarded

# the xlsxwriter engine is needed for workbook.add_format below
with pd.ExcelWriter('final_result.xlsx', engine='xlsxwriter') as writer:
    df.to_excel(writer, sheet_name='Sheet1', index=False)
    # --- Wrapping the text of column D
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    format = workbook.add_format({'text_wrap': True})
    worksheet.set_column(description_index, description_index, None, format)
    # --- Auto-fit all the columns
    for column in df:
        column_width = max(df[column].astype(str).map(len).max(), len(column))
        col_idx = df.columns.get_loc(column)
        worksheet.set_column(col_idx, col_idx, column_width)
# Output (in Excel) :
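If you would rather stay with the openpyxl engine that dfs_tabs already uses, a hedged alternative sketch is to apply an openpyxl Alignment with wrap_text to the Description column after each sheet is written (the helper name wrap_description_column is hypothetical, and the column name 'Description' is taken from the YAML above):

from openpyxl.styles import Alignment
from openpyxl.utils import get_column_letter

def wrap_description_column(worksheet, dataframe, column_name='Description', width=60):
    """Apply wrap_text to one column of a sheet written with the openpyxl engine."""
    col = dataframe.columns.get_loc(column_name) + 1          # 1-based column index
    for (cell,) in worksheet.iter_rows(min_col=col, max_col=col, min_row=2):
        cell.alignment = Alignment(wrap_text=True, vertical='top')
    worksheet.column_dimensions[get_column_letter(col)].width = width  # wrapping only shows with a bounded width

# e.g. inside dfs_tabs, right after dataframe.to_excel(...):
#     wrap_description_column(writer.sheets[sheet], dataframe)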

Split json file into multiple csv files depending on date?

I am trying to split up a JSON file from Alpha Vantage's API into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with, but it gives me TypeError: 'list' object is not callable. I'm fairly new to Python and pandas, so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json
symbol = "MSFT"
symbol_list = symbol.split(",")
def num_el(list):
    count = 0
    for element in list:
        count += 1
    return count

def csv_make(sy, dar, dat):
    csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
    csv_file.write(dat)
    csv_file.close()

i = 0
x = -1
n = num_el(symbol_list)
while i < n:
    namesym = symbol_list[x]
    ticker = namesym
    api_key = 'APIKEYHERE'
    url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
    data = requests.get(url)
    dsf = data.json()
    daf = pd.DataFrame(dsf['Time Series (1min)'])
    dxf: DataFrame = daf.T
    dxf.index.name = 'time'
    dxf.reset_index(inplace=True)
    dxf['time'] = pd.to_datetime(dxf['time'])
    dxf['minute'] = dxf['time'].dt.time
    dxf['day'] = dxf['time'].dt.day
    dxf['date'] = dxf['time'].dt.date
    agg = dxf.groupby([dxf['day']])
    length1 = dxf.groupby([dxf['day']]).size()
    length = pd.DataFrame(length1)
    length.index.name = 'day'
    length.reset_index(inplace=True)
    length_sum = length[0].sum()
    v = 0
    d = length_sum
    b = len(length)
    x2 = length_sum
    while v < b:
        a = length[0][v]
        x2 -= length[0][v]
        xd = agg.get_group(length['day'][v])
        date = xd['date'][x2]
        max_dt = parser.parse(str(max(xd['minute'])))
        min_dt = parser.parse(str(min(xd['minute'])))
        dt_range = []
        while min_dt <= max_dt:
            dt_range.append(min_dt.strftime("%H:%M:%S"))
            min_dt += timedelta(seconds=60)
        complete_df = pd.DataFrame({'minute': dt_range})
        xy = complete_df.astype('str')
        yx = xd.astype('str')
        dasf = xy.merge(yx, how='left', on='minute')
        dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
        time = []
        open = []
        high = []
        low = []
        close = []
        volume = []
        empty_value = []
        for ib in range(len(dasf)):
            time.append(dasf['minute'][ib])
            open.append(dasf['1. open'][ib])
            high.append(dasf['2. high'][ib])
            low.append(dasf['3. low'][ib])
            close.append(dasf['4. close'][ib])
            volume.append(dasf['5. volume'][ib])
            empty_value.append(dasf['ev'][ib])
        time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
        open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
        high_df = pd.DataFrame(high).rename(columns={0: 'High'})
        low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
        close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
        volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
        empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
        frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
        df = pd.concat(frames, axis=1, join='inner')
        df = df.set_index('Time')
        ad = df.to_csv()
        csv_make(namesym, date, ad)
        v += 1
    i += 1
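Two hedged observations rather than a definitive answer. First, the TypeError almost certainly comes from the line open = []: it rebinds the built-in open to a list at module level, so the next csv_make call (which does open(...)) fails with 'list' object is not callable; renaming that list (for example to opens) should clear the error. Second, pandas can do the per-date split and the gap filling more directly; a minimal sketch, reusing the column names from the response handled above:

import pandas as pd

def split_by_date(dxf, symbol):
    """Write one CSV per calendar date, padding missing minutes with blank rows."""
    dxf = dxf.sort_values('time')
    for day, day_df in dxf.groupby(dxf['time'].dt.date):
        # Reindex the day onto a complete 1-minute range so gaps become empty rows.
        full_range = pd.date_range(day_df['time'].min(), day_df['time'].max(), freq='1min')
        day_df = day_df.set_index('time').reindex(full_range)
        day_df['Empty Value'] = day_df['1. open'].isna()
        day_df.to_csv(f"{symbol}_1min_{day}.csv")

# usage with the frame built above:
# split_by_date(dxf, namesym)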

Convert DF column values to column (like pivot)

I am scraping api data and totaling counts of different values into a dictionary 'building':'count' for each player (row). I would like to be able to analyze it further. An easy solution would be to pull the different unique 'buildings' (dictionary keys within the row) as dataframe columns and then do the equivalent of an index/match/match on them. The script currently gets the data, and I can extract the unique keys, but I am lost at how to make them into DF columns and then how to do the index/match/match. There may be a better approach from even before running the 'count' part of the script.
You should be able to run the script; no credentials are required to GET against the API. If you look at the ranklist DF column with the building counts, you will see what I am referencing.
Thank you for any guidance!
import requests
import pandas as pd
from datetime import datetime
from datetime import date
from datetime import timedelta
import operator
from time import sleep
ranklist = pd.DataFrame()
for i in range(430):
    baserank_url = 'https://www.simcompanies.com/api/v3/encyclopedia/ranking/' + str(i) + '/'
    r = requests.get(baserank_url)
    rank_json = r.json()
    df = pd.DataFrame.from_dict(rank_json)
    df = df.filter(['company', 'id', 'rank', 'value'])
    ranklist = ranklist.append(df)
ranklist.to_csv(r'rank_dataframe.csv', index=False)
print('Ranking list started succesfully!')

levellist = []
bcolist = []
today = date.today()
for row in ranklist.itertuples():
    attempt = 0
    while True:
        if attempt == 6:
            break
        try:
            print(str(row.rank + 1) + ' ' + str(attempt))
            account_url = 'https://www.simcompanies.com/api/v2/players/' + str(row.id) + '/'
            r = requests.get(account_url)
            account_json = r.json()
            playerid = account_json.get("player").get("id")
            playerlevel = account_json.get("player").get("level")
            datestart = datetime.strptime(account_json.get("player").get("dateJoined")[:10], '%Y-%m-%d').date()
            yearsactive = round((today - datestart) / timedelta(days=365.2425), 2)
            buildings = account_json.get("buildings")
            certificates = account_json.get("certificates")
            bnames = [d['name'] for d in buildings]
            bnames = [n.replace('Park', 'Recreation').replace('Lake', 'Recreation').replace('Castle', 'Recreation') for n in bnames]
            cnames = [d['name'] for d in certificates]
            sptr = 'Yes' if 'Supporter' in cnames else 'No'
            dictOfElems = dict()
            for elem in bnames:
                if elem in dictOfElems:
                    dictOfElems[elem] += 1
                else:
                    dictOfElems[elem] = 1
            blist = {key: value for key, value in dictOfElems.items()}
            blist = dict(sorted(blist.items(), key=operator.itemgetter(1), reverse=True))
            bcolist.append([blist.keys()])
            levellist.append([playerid, playerlevel, sptr, datestart, yearsactive, blist])
        except:
            sleep(20)
            attempt += 1
            continue
        break

#get unique building values
bcodf = pd.DataFrame(bcolist, columns=['buildings'])
bcouni = list(set([a for b in bcodf.buildings.tolist() for a in b]))
print(bcouni)
leveldf = pd.DataFrame(levellist, columns=['id', 'level', 'sptr', 'datestart', 'yearsactive', 'blist'])
#clist = list(set([a for b in leveldf.cnames.tolist() for a in b]))
#print(leveldf[blist])
#bul = leveldf[blist].keys()
#buniq = list(set([a for b in leveldf.bul.tolist() for a in b]))
#print(bul)
ranklist = ranklist.merge(leveldf, on='id', how='left')
ranklist['rank'] += 1
ranklist.to_csv(r'rank_dataframe.csv', index=False)
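On the pivot part of the question: once leveldf exists, the per-player dictionaries in blist can be expanded into one numeric column per building, which replaces the index/match/match step. A hedged sketch using the names from the script above:

# Expand the dict column into one column per building type; players without a building get 0.
building_counts = pd.DataFrame(leveldf['blist'].tolist()).fillna(0).astype(int)
leveldf_wide = pd.concat([leveldf.drop(columns=['blist']), building_counts], axis=1)
# Merging leveldf_wide (instead of leveldf) into ranklist then gives one count column per
# building that can be filtered and aggregated directly.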

Making Excel histograms using python

This is the output of my python script so far.
Excel Table
The vertical axis of the table holds road names. The horizontal axis holds dates. The values indicate whether a road was under construction at the time and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots the longest amount of time within each group that a road was under construction, as well as the average amount for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()
def to_raw(string):
    return fr"{string}"

def cvrt(x):
    ans = re.split(r'(\d+)(?!.*\d)', x)
    return int(ans[1])

def indexer(s):
    pattern = re.compile(r'I, [0-9]+, ')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values

def int2Date(x):
    string = str(x)
    Y = int(string[0:4])
    M = int(string[4:6])
    D = int(string[6:8])
    return DATE(Y, M, D)

def dDelta(x, y):
    string1 = str(x)
    string2 = str(y)
    Y1 = int(string1[0:4])
    M1 = int(string1[4:6])
    D1 = int(string1[6:8])
    Y2 = int(string2[0:4])
    M2 = int(string2[4:6])
    D2 = int(string2[6:8])
    f_date = DATE(Y1, M1, D1)
    l_date = DATE(Y2, M2, D2)
    delta = l_date - f_date
    if isinstance(y, int):
        return float(int((delta.days)/30.44))
    else:
        return int((delta.days)/30.44)

def Book(path):
    file = open(path, 'r')
    lines = file.readlines()
    file.close()
    book = IndexedOrderedDict()
    for line in lines:
        if re.match("I", line):
            IDs = indexer(line)[1]
        if re.match(" 0.00,", line):
            rID = line
        #"GM_FINAL_AUTH,0,[1-9]"
        if re.search("GM_FINAL_AUTH,0,[1-9]", line):
            book.update({(rID, line): to_raw(IDs)})
    return sort_book(book)

def dUpdate(dic, key, value):
    return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})

def valSplt(s):
    pattern = re.compile(r'(\d+)')
    gm = re.split(pattern, s)
    values = s.rsplit(gm[1])
    gm = gm[1]
    values[1] = gm
    return values

def sort_book(book):
    book = natsorted([value, key] for key, value in book.items())
    book = IndexedOrderedDict((data[1], data[0]) for data in book)
    return book

def alph_order(word1, word2):
    for i in range(min(len(word1), len(word2))):
        if ord(word1[i]) == ord(word2[i]):
            pass
        elif ord(word1[i]) > ord(word2[i]):
            return word2
        else:
            return word1
    return word1

def read(cpdm, date_list):
    sCnt = [0] * len(cpdm)
    lowest_number = 999999999999
    terminationCondition = [True] * len(cpdm)
    saved_results = [0] * len(cpdm)
    current_prefix = None
    cnt = 0
    while any(terminationCondition) is True:
        saved_results = [0] * len(cpdm)
        last_prefix = None
        lowest_number = 999999999999
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                ID = cpdm[dicIdx].values()[dicVal]
                # print(entry)
                current_prefix, road_number = valSplt(ID)
                road_number = int(road_number)
                if last_prefix is None:
                    last_prefix = current_prefix
                higherOrder_prefix = alph_order(last_prefix, current_prefix)
                # print('check:',[higherOrder_prefix, last_prefix, current_prefix])
                if current_prefix == higherOrder_prefix:
                    if current_prefix != last_prefix:
                        lowest_number = road_number
                        last_prefix = current_prefix
                    elif road_number < lowest_number:
                        lowest_number = road_number
                        last_prefix = current_prefix
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                # print(dicIdx, dicVal, len(cpdm[dicIdx]))
                ID = cpdm[dicIdx].values()[dicVal]
                VALUE = cpdm[dicIdx].keys()[dicVal]
                # print(entry)
                road_name, road_number = valSplt(ID)
                road_number = int(road_number)
                if road_name == last_prefix and lowest_number == road_number:
                    saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
                    if dicVal < len(cpdm[dicIdx]):
                        sCnt[dicIdx] += 1
                    else:
                        terminationCondition[dicIdx] = False
            else:
                terminationCondition[dicIdx] = False
        for rst in range(len(saved_results)):
            if saved_results[rst] == 0:
                pass
            else:
                sheet.write(cnt+1, 0, str(saved_results[rst][0]))
                sheet.write(cnt+1, rst+1, cvrt(saved_results[rst][1]))
                #sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
                #sheet.write(cnt+1, 0, saved_results[rst][3])
        cnt += 1

def main():
    # 2018 MAPS
    path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
    path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
    path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
    path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
    # 2019 MAPS
    path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
    path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
    path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
    path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
    path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
    path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
    # 2020 MAPS
    path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
    path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
    path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
    p_list = [path1, path2, path3, path4, path5, path6, path7,
              path8, path9, path10, path11, path12, path13]
    pool = mp.Pool(mp.cpu_count())
    CPDM = pool.map(Book, p_list)
    pool.close()
    #pool.join()
    date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
                 20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
    #CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
    for i in CPDM:
        print(len(i))
    #sheet.write("A1", "Lat Long")
    sheet.write("A1", "ID")
    #for i in range(len(CPDM)):
    cn = 0
    for i in date_list:
        #sheet.write(0, 3*i+1, "ID" + str(i+1))
        sheet.write(0, cn+1, int2Date(i), format1)
        cn += 1
        #sheet.write(0, 2*i+3, "Date" + str(i+1))
    read(CPDM, date_list)
    workBook.close()

if __name__ == "__main__":
    main()
    executionTime = (time.time() - startTime)
    print('Execution time in minutes: ' + str(executionTime/60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),  # Add one day because groups with same start and finish will not be visible on the plot
        })

# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)

# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed")  # otherwise tasks are listed from the bottom up
fig.show()
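Building on roads_df from the script above, a hedged sketch of the yearly numbers asked for in the question (it assumes the column headers were parsed as dates and attributes each period to the year in which it starts):

# Length of each construction period in days, then the longest and the mean per starting year.
roads_df["days"] = (roads_df["finish"] - roads_df["start"]).dt.days
per_year = roads_df.groupby(roads_df["start"].dt.year)["days"].agg(["max", "mean"])
print(per_year)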

How to interpret duplicate hourly records - Wunderground API

For a specific historical query: http://api.wunderground.com/api/MY_API_KEY/history_20131231/geolookup/q/Beijing/Beijing.json
I get periodic, duplicate data for a given hour. For example, on January 1st, 2014 at 2:00 pm there will be two rows with distinct data, such as:
"Year", "Month", "Day", "Hour", "Min", "Humidity", "Temperature_F", "Windspeed_mph", "Wind_direction_deg", "Air_Pressure_mb"
"2014","01","01","14","00","7","53","6.7","250","1014"
"2014","01","01","14","00","13","51.8","17.9","300","1014"
I don't think it's a problem with my code, but rather with the API. How do I explain this? Anyway, here is my code:
'''
Takes a date and file name, returns an hourly print out of historical weather
data, including humidity, temperature, wind speed, wind direction, and air pressure
'''
# Imports used below (assumed; they were not shown in the original snippet).
# Note this is Python 2 code (print statements, urllib2).
import calendar
import csv
import json
import time
import urllib2

def print_column_headings(headers, file_name):
    with open(file_name, 'wb') as initial_file:
        w = csv.writer(initial_file, quoting=csv.QUOTE_ALL)
        w.writerow(headers)
        initial_file.close()

def print_one_day_of_weather_data(year, month, day, max_rows, file_to_write):
    # get data
    url = 'http://api.wunderground.com/api/<MY_API_KEY>/history_' + year + month + day + '/geolookup/q/Beijing/Beijing.json'
    f = urllib2.urlopen(url)
    print url
    json_string = f.read()
    parsed_json = json.loads(json_string)
    # paramterize data
    all_obs = parsed_json['history']['observations']
    # if we reach the end of the observations
    for n in range(0, len(all_obs)):
        # or we exceed the max rows desired
        if n > max_rows:
            return 0
        else:
            base_path = all_obs[n]
            date_path = base_path['date']
            year = date_path['year']
            month = date_path['mon']
            day = date_path['mday']
            hour = date_path['hour']
            min = date_path['min']
            humidity = base_path['hum']
            temp_f = base_path['tempi']
            windspeed_mph = base_path['wspdi']
            winddir_deg = base_path['wdird']
            air_pressure_mb = base_path['pressurem']
            # utc time
            params = [year, month, day, hour, min, humidity, temp_f, windspeed_mph, winddir_deg, air_pressure_mb]
            print params
            with open(file_to_write, 'a') as csvfile:
                w = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
                w.writerow(params)
                csvfile.close()

headers = ("Year", "Month", "Day", "Hour", "Min", "Humidity", "Temperature_F", "Windspeed_mph", "Wind_direction_deg", "Air_Pressure_mb")

def append_leading_zeroes(num):
    return "%02d" % (num,)

def days_in_a_month(year, month_num):
    return calendar.monthrange(year, month_num)[1]

def file_namer(city, month_num, year):
    return "raw weather - local time " + city + "-" + calendar.month_name[month_num] + "-" + str(year) + ".csv"

def gen_entire_month(city, month, year, should_print_headers, start_at_day=1):
    file_name = file_namer(city, month, year)
    if should_print_headers:
        print_column_headings(headers, file_name)
    for day in range(start_at_day, days_in_a_month(year, month) + 1):
        print_one_day_of_weather_data(str(year), append_leading_zeroes(month), append_leading_zeroes(day), 100, file_name)
        # if we make too many calls in a short period of time, the API refuses are calls
        time.sleep(60)

gen_entire_month("Beijing", 1, 2014, should_print_headers=True)
We get some obs from two NWS or World Meteorological Organization sources. Look in the field "metar" and only use readings that start with "METAR ...", because those are from the obs station. The ones that begin with anything else, like AAXX, are special readings. They are also legit, but blending the two sources is confusing.
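A hedged sketch of that filter, assuming each observation dict carries the raw report in a 'metar' field as described above:

# Keep only the station readings; drop AAXX/special reports before writing rows.
station_obs = [obs for obs in all_obs if obs.get('metar', '').startswith('METAR')]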
