I need to parse a large CSV file (1 GB) that contains weather data. The file itself is here:
ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/2014.csv.gz
Additional info (stations code and file format):
ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/ghcnd-stations.txt
ftp://ftp.ncdc.noaa.gov/pub/data/ghcn/daily/by_year/readme.txt
I need to find there information on Kiev and Dnipropetrovsk and visualise monthly averages.
I have written an algorithm for finding the data and its averages, but it doesn't give me the data for the latest month.
import csv
import matplotlib.pyplot as plt
# Python 2 script (note the `print` statement): scans 2014.csv, accumulates
# TAVG values for two station codes and plots one value per completed month.
# NOTE(review): the paste lost its indentation; the nesting below is a
# reconstruction and should be confirmed against the author's original.
f = open('2014.csv', 'rb')
try:
    reader = csv.reader(f)
    avgK = 0  # running sum of TAVG values for station UPM00033345
    avgD = 0  # running sum of TAVG values for station UPM00034504
    date = 0  # int(row[1]) % 10000; presumably MMDD of a YYYYMMDD date -- TODO confirm
    mon = 1  # month number the running sums currently belong to
    avergK = []  # per-month results shown in the first subplot
    avergD = []  # per-month results shown in the second subplot
    count_date = 1  # divisor for the averages; starts at 1 and grows by one per TAVG row
    for row in reader:
        if row[2] == 'TAVG':
            count_date +=1
            date = (int(row[1]) % 10000)
            if row[0] == 'UPM00033345':
                # Value is divided by 10; presumably stored in tenths of a degree -- TODO confirm.
                avgK += float(row[3])/10.0
            elif row[0] == 'UPM00034504':
                avgD += float(row[3])/10.0
            # A row whose month number exceeds `mon` closes the current month.
            # NOTE(review): results are appended only here, so the sums for the
            # final month in the file are never appended.
            if (date//100 > mon):
                print date //100, mon, date%100, avgK, avgD
                avergK.append(avgK/count_date)
                avergD.append(avgD/count_date)
                mon += 1
                avgK = 0
                avgD = 0
                count_date = 1
            continue
finally:
    # Close the file whether or not the loop raised.
    f.close()
# Two stacked subplots, one per station.
plt.subplot(2, 1, 1)
plt.plot(avergK)
plt.xlabel('Month')
plt.ylabel('Average Temperature')
plt.title('AVG in Kiev 2014')
plt.grid(True)
plt.subplot(2, 1, 2)
plt.plot(avergD)
plt.xlabel('Month')
plt.ylabel('Average Temperature')
plt.title('AVG in DNIPROPETROVSK 2014')
plt.grid(True)
plt.show()
Is it possible to solve it using pandas?
Maybe you could use pandas here, but you do not need it to solve the current problem. What happens is that you store a monthly average only when you encounter a row from a new month. When you reach the end of the file, you must also process the last month.
Your loop should be:
# Replacement for the row loop: same accumulation as before, plus a final
# append after the loop so the last month is not dropped.
# Relies on reader, avgK, avgD, mon, count_date, avergK and avergD being
# initialised by the surrounding script.
# NOTE(review): indentation was lost in the paste; nesting is reconstructed.
for row in reader:
    if row[2] == 'TAVG':
        count_date +=1
        date = (int(row[1]) % 10000)
        if row[0] == 'UPM00033345':
            avgK += float(row[3])/10.0
        elif row[0] == 'UPM00034504':
            avgD += float(row[3])/10.0
        # A row from a later month closes the month accumulated so far.
        if (date//100 > mon):
            print date //100, mon, date%100, avgK, avgD
            avergK.append(avgK/count_date)
            avergD.append(avgD/count_date)
            mon += 1
            avgK = 0
            avgD = 0
            count_date = 1
        continue
# store values for last month
avergK.append(avgK/count_date)
avergD.append(avgD/count_date)
Related
I know this should be simple, but I'm new to programming and could use some assistance in solving this. Thank you.
def daycalc(value, count):
    # Attempt at summing column 11 for rows whose column 21 is Friday,
    # Saturday or Sunday, reading from the module-level `my_reader`.
    # NOTE(review): indentation was lost in the paste; nesting is reconstructed.
    # NOTE(review): the parameters `value` and `count` are never used.
    # NOTE(review): `tot` is read before it is ever assigned in this function,
    # and `cnt` is reset to 0 at the start of every row.
    for row in my_reader:
        cnt = 0
        if row[21] == 'Friday' or row[21] == 'Saturday' or row[21] == 'Sunday':
            tot = tot + float(row[11])
            cnt += 1
    return tot, cnt
# Tally sales (column 11) and row counts for Friday, Saturday and Sunday rows
# (column 21) of POS.csv.
# NOTE(review): indentation was lost in the paste; nesting is reconstructed.
with open('POS.csv') as csvfile:
    my_reader = csv.reader(csvfile, delimiter=',')
    fricnt = 0
    satcnt = 0
    suncnt = 0
    sun = 0
    door = 0  # 0 until the first row has been consumed
    hdr = []
    sales = 0
    frisales = 0
    satsales = 0
    sunsales = 0
    total = 0
    totcnt = 0
    for row in my_reader:
        if door == 0:
            # The first row is stored in hdr instead of being tallied.
            hdr.append(row)
            door = 1
        elif row[21] == 'Friday':
            frisales = frisales + float(row[11])
            fricnt += 1
        elif row[21] == 'Saturday':
            satsales = satsales + float(row[11])
            satcnt += 1
        elif row[21] == 'Sunday':
            sunsales = sunsales + float(row[11])
            suncnt += 1
    # Combined figures across the three days.
    total = frisales + satsales + sunsales
    totcnt = fricnt + satcnt + suncnt
# NOTE(review): placeholder line -- the first `#` starts a comment that swallows
# the rest of the line, including the closing parenthesis, so this does not parse.
print('3-day Total Sales:',#sum of sales would be here , '3-day Average Sale:', #count would be here)
I’ve tried recreating the function differently, but I can’t seem to apply it to my logic and get it to work.
If we assume that your CSV file looks something like this:
0,0,0,0,0,0,0,0,0,0,0,50,0,0,0,0,0,0,0,0,0,Saturday,0
0,0,0,0,0,0,0,0,0,0,0,12.5,0,0,0,0,0,0,0,0,0,Saturday,0
0,0,0,0,0,0,0,0,0,0,0,50,0,0,0,0,0,0,0,0,0,Friday,0
0,0,0,0,0,0,0,0,0,0,0,50,0,0,0,0,0,0,0,0,0,Saturday,0
0,0,0,0,0,0,0,0,0,0,0,50,0,0,0,0,0,0,0,0,0,Sunday,0
Then you could process it like this:
from collections import defaultdict
import csv
FILENAME = 'POS.csv'
# Accumulator keyed by the day name found in column 21:
# day -> [row count, summed value of column 11].
D = defaultdict(lambda: [0, 0.0])
DAYS = ['Friday', 'Saturday', 'Sunday']


def show(p, t, c):
    """Print total, count and average for one labelled group.

    p is the label, t the summed sales and c the number of rows.
    c must be non-zero because the average is computed as t / c.
    """
    print(f'{p} total={t:.2f} count={c} average={t/c:.2f}')


# newline='' is what the csv module documentation asks for when opening files.
with open(FILENAME, newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        k = row[21]
        D[k][0] += 1
        D[k][1] += float(row[11])

# Per-day lines are printed only for days that actually occurred.
for day in DAYS:
    if (count := D[day][0]) > 0:
        show(day, D[day][1], count)

total = sum(D[k][1] for k in DAYS)
count = sum(D[k][0] for k in DAYS)
# Same guard as for the individual days: with no Friday/Saturday/Sunday rows
# the average would divide by zero.
if count > 0:
    show('Three day', total, count)
This is the output of my python script so far.
Excel Table
The vertical axis of the table lists road names. The horizontal axis lists dates. The values indicate whether a road was under construction at the time, and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots, for each group, the longest amount of time that a road was under construction and the average amount for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect that there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
# Module-level output workbook and worksheet; read() and main() write into
# `sheet`, and main() closes `workBook` when it is done.
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()  # not referenced elsewhere in the visible code
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})  # applied to the date header cells in main()
sheet = workBook.add_worksheet()
def to_raw(string):
    """Return *string* rendered with its default formatting."""
    return "{}".format(string)
def cvrt(x):
    """Return, as an int, the digit run in *x* that no further digit follows.

    The negative look-ahead rejects any digit run that still has a digit
    somewhere after it, so the captured group is the final number.
    Raises IndexError when *x* contains no such digit run.
    """
    pieces = re.split(r'(\d+)(?!.*\d)', x)
    trailing_digits = pieces[1]
    return int(trailing_digits)
def indexer(s):
    """Split *s* around the text that follows its ``I, <number>, `` marker.

    Returns the list produced by splitting *s* on that trailing text, with
    element 1 replaced by the trailing text itself; callers read element 1.
    Raises IndexError when the marker is absent.
    """
    pieces = re.split(r'I, [0-9]+, ', s)
    tail = pieces[1]
    values = s.rsplit(tail)
    values[1] = tail
    return values
def int2Date(x):
    """Convert a YYYYMMDD value (int or str) into a ``DATE`` object."""
    digits = str(x)
    year, month, day = int(digits[:4]), int(digits[4:6]), int(digits[6:8])
    return DATE(year, month, day)
def dDelta(x, y):
    """Return the whole number of 30.44-day months from date *x* to date *y*.

    Both arguments are YYYYMMDD values. The result is a float when *y* is an
    int and an int otherwise.
    """
    def _ymd(stamp):
        # Slice the textual form into year, month and day integers.
        text = str(stamp)
        return int(text[0:4]), int(text[4:6]), int(text[6:8])

    first_parts = _ymd(x)
    last_parts = _ymd(y)
    first = DATE(*first_parts)
    last = DATE(*last_parts)
    months = int((last - first).days / 30.44)
    if isinstance(y, int):
        return float(months)
    return months
def Book(path):
    """Parse one DBData text file into a naturally sorted book.

    Each line matching ``GM_FINAL_AUTH,0,[1-9]`` becomes one entry whose key
    is ``(rID, line)`` and whose value is the most recent ID text, where:

    * the ID text is element 1 of ``indexer(line)`` for the latest line that
      starts with ``I``;
    * ``rID`` is the latest line that matches the `` 0.00,`` pattern.

    The collected entries are returned through ``sort_book``.

    NOTE(review): a GM_FINAL_AUTH line that appears before any ``I`` line or
    any `` 0.00,`` line raises NameError, because ``IDs``/``rID`` are not yet
    bound. Confirm the input format guarantees that order.
    """
    # The context manager releases the handle even if reading fails.
    with open(path, 'r') as handle:
        lines = handle.readlines()
    book = IndexedOrderedDict()
    for line in lines:
        if re.match("I", line):
            IDs = indexer(line)[1]
        if re.match(" 0.00,", line):
            rID = line
        if re.search("GM_FINAL_AUTH,0,[1-9]", line):
            book.update({(rID, line): to_raw(IDs)})
    return sort_book(book)
def dUpdate(dic, key, value):
    """Store *value* in *dic* under ``(key[0], "GM_FINAL_AUTH,0,0")``.

    Returns whatever ``dic.update`` returns (``None`` for a dict).
    """
    zeroed_key = (key[0], "GM_FINAL_AUTH,0,0")
    outcome = dic.update({zeroed_key: value})
    return outcome
def valSplt(s):
    """Split *s* at its first run of digits and return ``[prefix, digits]``.

    ``prefix`` is the text before the first digit run and ``digits`` is that
    run as a string. Anything after the run is discarded.

    The result always has exactly two elements, so callers can unpack it
    into two names even when the same digits occur again later in *s*
    (for example ``"A1 B12"``).

    Raises IndexError when *s* contains no digits.
    """
    # maxsplit=1 stops after the first digit run; the capture group keeps the
    # digits themselves in the result at index 1.
    pieces = re.split(r'(\d+)', s, maxsplit=1)
    return [pieces[0], pieces[1]]
def sort_book(book):
    """Return a new IndexedOrderedDict with *book*'s items in natural order.

    Items are ordered by ``natsorted`` applied to ``[value, key]`` pairs,
    so the value is the primary sort field.
    """
    ordered_pairs = natsorted([value, key] for key, value in book.items())
    return IndexedOrderedDict((pair[1], pair[0]) for pair in ordered_pairs)
def alph_order(word1, word2):
    """Return whichever word has the lower code point at the first difference.

    When one word is a prefix of the other, or the two are equal, *word1* is
    returned.
    """
    for left, right in zip(word1, word2):
        left_code, right_code = ord(left), ord(right)
        if left_code == right_code:
            continue
        return word2 if left_code > right_code else word1
    return word1
def read(cpdm, date_list):
    """Merge the books in *cpdm* pass by pass and write them to ``sheet``.

    Each pass first scans the current entry of every book to pick a target
    (prefix, number) pair, then writes every book whose current entry equals
    that pair into one worksheet row and advances that book's cursor.

    cpdm      -- sequence of books as built by Book(); values()/keys() are
                 indexed by position, so they must support integer indexing.
    date_list -- one value per book; stored with each result but not written
                 (the write that used it is commented out).

    NOTE(review): indentation was lost in the paste; the nesting below is a
    reconstruction and should be confirmed against the author's original.
    """
    sCnt = [0] * len(cpdm)  # per-book cursor: index of the next entry to consume
    lowest_number = 999999999999
    terminationCondition = [True] * len(cpdm)  # False once a book is exhausted
    saved_results = [0] * len(cpdm)
    current_prefix = None
    cnt = 0  # pass counter; pass k is written to worksheet row k + 1
    while any(terminationCondition) is True:
        # Reset the per-pass state.
        saved_results = [0] * len(cpdm)
        last_prefix = None
        lowest_number = 999999999999
        # Scan 1: choose the target prefix (via alph_order) and the lowest
        # road number seen for that prefix among the current entries.
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                ID = cpdm[dicIdx].values()[dicVal]
                # print(entry)
                current_prefix, road_number = valSplt(ID)
                road_number = int(road_number)
                if last_prefix is None:
                    last_prefix = current_prefix
                higherOrder_prefix = alph_order(last_prefix, current_prefix)
                # print('check:',[higherOrder_prefix, last_prefix, current_prefix])
                if current_prefix == higherOrder_prefix:
                    if current_prefix != last_prefix:
                        # A prefix that sorts earlier restarts the minimum search.
                        lowest_number = road_number
                        last_prefix = current_prefix
                    elif road_number < lowest_number:
                        lowest_number = road_number
                        last_prefix = current_prefix
        # Scan 2: collect every book whose current entry matches the target
        # and advance its cursor; mark books with no entries left.
        for dicIdx, dicVal in enumerate(sCnt):
            if dicVal < len(cpdm[dicIdx]):
                # print(dicIdx, dicVal, len(cpdm[dicIdx]))
                ID = cpdm[dicIdx].values()[dicVal]
                VALUE = cpdm[dicIdx].keys()[dicVal]  # (rID, line) key tuple as built in Book()
                # print(entry)
                road_name, road_number = valSplt(ID)
                road_number = int(road_number)
                if road_name == last_prefix and lowest_number == road_number:
                    saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
                    # NOTE(review): with this nesting the test repeats the
                    # enclosing condition, so the else branch cannot run.
                    if dicVal < len(cpdm[dicIdx]):
                        sCnt[dicIdx] += 1
                    else:
                        terminationCondition[dicIdx] = False
            else:
                terminationCondition[dicIdx] = False
        # Write the pass: column 0 holds the ID, column rst + 1 holds the
        # number cvrt() extracts from the matched line of book rst.
        for rst in range(len(saved_results)):
            if saved_results[rst] == 0:
                pass
            else:
                sheet.write(cnt+1, 0, str(saved_results[rst][0]))
                sheet.write(cnt+1, rst+1, cvrt(saved_results[rst][1]))
                #sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
                #sheet.write(cnt+1, 0, saved_results[rst][3])
        cnt += 1
def main():
    """Parse every DBData file in parallel and write the merged worksheet.

    Builds one book per input file with a process pool, prints each book's
    length, writes the header row (``ID`` plus one date per file), hands the
    books to ``read`` and finally closes the workbook.
    """
    p_list = [
        # 2018 MAPS
        "W:\\Scripting\\2018\\DBData_84577881.txt",
        "W:\\Scripting\\2018\\DBData_84639568.txt",
        "W:\\Scripting\\2018\\DBData_84652483.txt",
        "W:\\Scripting\\2018\\DBData_84670490.txt",
        # 2019 MAPS
        "W:\\Scripting\\2019\\DBData_84706383.txt",
        "W:\\Scripting\\2019\\DBData_84715201.txt",
        "W:\\Scripting\\2019\\DBData_84743195.txt",
        "W:\\Scripting\\2019\\DBData_84777742.txt",
        "W:\\Scripting\\2019\\DBData_84815446.txt",
        "W:\\Scripting\\2019\\DBData_84835743.txt",
        # 2020 MAPS
        "W:\\Scripting\\2020\\DBData_84882849.txt",
        "W:\\Scripting\\2020\\DBData_84966202.txt",
        "W:\\Scripting\\2020\\DBData_84988789.txt",
    ]

    # One Book() call per file, spread over all CPU cores.
    pool = mp.Pool(mp.cpu_count())
    CPDM = pool.map(Book, p_list)
    pool.close()

    # Survey date of each file, in the same order as p_list.
    date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
                 20190501, 20190628, 20190815, 20190925, 20200207, 20200501,
                 20200617]

    for book in CPDM:
        print(len(book))

    # Header row: "ID" in column A, then one formatted date per file.
    sheet.write("A1", "ID")
    for column, stamp in enumerate(date_list, start=1):
        sheet.write(0, column, int2Date(stamp), format1)

    read(CPDM, date_list)
    workBook.close()
if __name__ == "__main__":
    main()
    # Report the elapsed wall-clock time since startTime was taken at import.
    # Kept inside the guard so it runs only for the script itself, never on
    # import or in worker processes that re-import this module.
    executionTime = (time.time() - startTime)
    print('Execution time in minutes: ' + str(executionTime/60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name by taking the row-wise maximum.
# groupby(..., axis=1) is deprecated in recent pandas, so group the transposed
# frame by its index instead and flip the result back.
df = df.T.groupby(level=0).max().T
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's: the cumulative count of non-1 cells is constant
    # within each unbroken run of 1's, so it serves as the run identifier.
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            # Add one day because groups with same start and finish will not be visible on the plot
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),
        })
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed")  # otherwise tasks are listed from the bottom up
fig.show()
So I got this error raised
ValueError: time data '8/16/2016 9:55' does not match format '%m/&d/%Y
%H:%M'.
I know that %m is the format for month with two digits (zero-padded). And as we can see that '8' (August) does not have zero padded. Is that the problem for this error? And how I fix this?
import datetime as dt
# Count posts and sum comment counts per hour of day.
# `ask_posts` comes from elsewhere; each entry's index 6 is used as the
# timestamp text and index 4 as an integer count.
# NOTE(review): indentation was lost in the paste; nesting is reconstructed.
result_list = []
for a in ask_posts:
    result_list.append([a[6], int(a[4])])
counts_by_hour = {}
comments_by_hour = {}
# NOTE(review): '&d' is not a strptime directive; this is the format quoted in
# the ValueError above.
date_format = '%m/&d/%Y %H:%M'
for row in result_list:
    date = row[0]
    comment = row[1]
    time = dt.datetime.strptime(date, date_format).strftime("%H")
    # NOTE(review): the next line is markup from the post, not Python.
    ``` I want to extract the Hour only```
    if time not in counts_by_hour:
        counts_by_hour[time] = 1
        comments_by_hour[time] = comment
    else:
        counts_by_hour[time] += 1
        # NOTE(review): `comments_by_hours` (trailing "s") is not defined
        # anywhere in this snippet.
        comments_by_hours[time] += comment
You have an error in your date format: the day directive must be `%d`, not `&d`.
import datetime as dt
# Count posts and sum comment counts per hour of day.
# `ask_posts` comes from elsewhere; each entry's index 6 is used as the
# timestamp text and index 4 as an integer count.
result_list = []
for a in ask_posts:
    result_list.append([a[6], int(a[4])])
counts_by_hour = {}
comments_by_hour = {}
date_format = '%m/%d/%Y %H:%M'  # change & with %
for row in result_list:
    date = row[0]
    comment = row[1]
    # Extract the hour only, as a two-digit string such as "09".
    time = dt.datetime.strptime(date, date_format).strftime("%H")
    if time not in counts_by_hour:
        counts_by_hour[time] = 1
        comments_by_hour[time] = comment
    else:
        counts_by_hour[time] += 1
        # Accumulate into the dict created above; the misspelt name
        # `comments_by_hours` would raise NameError here.
        comments_by_hour[time] += comment
I am trying to create an Excel file that is the combination of multiple Excel files. However, when I copy a cell with the value 00:00 and append it to the master Excel file, Excel thinks the time is for the year 1899?
Here is my code:
def excel_graphs_all(day, users):
    """Combine per-user workbooks into one workbook with a chart per user.

    For every entry in *users* the file ``<user[1]>_<day>.xlsx`` is opened
    read-only, the first two columns of its active sheet are copied to a new
    sheet named ``user[1]``, and a scatter chart of column 2 against column 1
    is added to the ``<day> Graphs`` sheet. Charts are laid out two per row,
    in columns A and H, 16 rows apart.

    Returns the name of the saved workbook, ``graphs_<day>.xlsx``.
    """
    chart_wb = Workbook(write_only=True)
    graph_ws = chart_wb.create_sheet(day + ' Graphs', 0)
    chart_wb_filename = 'graphs_' + day + '.xlsx'
    columnNum = ['A', 'H']
    rowNum = 1
    for i, user in enumerate(users):
        filename = user[1] + '_' + day + '.xlsx'
        iter_wb = load_workbook(filename=filename, read_only=True)
        try:
            ws = iter_wb.active
            chart_ws = chart_wb.create_sheet(user[1])
            for row in ws.rows:
                chart_ws.append([row[0].value, row[1].value])
            last_row = ws.max_row
        finally:
            # Read-only workbooks keep the source file open until closed.
            iter_wb.close()

        chart = ScatterChart()
        chart.title = user[1] + ' ' + day + ' Heartrate Data'
        chart.x_axis.title = 'Time'
        chart.y_axis.title = 'Heartrate'
        chart.x_axis.scaling.min = 0
        chart.x_axis.scaling.max = 1
        xvalues = Reference(chart_ws, min_col=1, min_row=1, max_row=last_row)
        yvalues = Reference(chart_ws, min_col=2, min_row=1, max_row=last_row)
        series = Series(yvalues, xvalues, title='Heartrate')
        chart.series.append(series)

        # Alternate between the two anchor columns; move down after each pair.
        spot = columnNum[i % 2] + str(rowNum)
        graph_ws.add_chart(chart, spot)
        if (i + 1) % 2 == 0:
            rowNum += 16
    chart_wb.save(chart_wb_filename)
    return chart_wb_filename
Thanks!
What do you mean by value 00:00? Excel uses formatting and not typing for dates and times. From the specification:
When using the 1900 date system, which has a base date of 30th
December 1899, a serial date- time of 1.5 represents midday on the
31st December 1899
It sounds like you just need to check the formatting for the relevant cells.
My function is to read data from a file that consists of dates with times a tweet was written, and sentiments (good, bad or neutral) it's classified as; select date with times, and sentiments between a start and end date; and finally create three dictionaries (positive, negative and neutral) that use the date as key, and number of positive, negative or neutral tweets made in a day.
The problems I have are:
a) How do I get only the date to display, and not the date and time?
b) How do I get my program to include both start and end date?
c) How do I separate a key and value with a semi-colon in a dictionary?
def get_sentiment_dates(start_date, end_date):
    """Count tweets per day and sentiment between two dates, both inclusive.

    Reads ``BAC2_answer.csv``, where each line is
    ``YYYY-MM-DD HH:MM:SS,<sentiment>``. A sentiment of ``Bullish`` counts as
    positive, ``Bearish`` as negative and anything else as neutral.

    start_date, end_date -- ``YYYY-MM-DD`` strings. A tweet is included when
    its calendar day lies between them, so tweets at any time on the end date
    are counted.

    Returns ``[positive_dict, negative_dict, neutral_dict]``. Each dict maps a
    ``datetime.date`` (no time part) to the number of tweets of that sentiment
    on that day.

    Raises ValueError when a date string or a timestamp does not match its
    expected format.
    """
    first_day = datetime.strptime(start_date, '%Y-%m-%d').date()
    last_day = datetime.strptime(end_date, '%Y-%m-%d').date()

    positive_dict = {}
    negative_dict = {}
    neutral_dict = {}

    # The context manager closes the file even when a line fails to parse.
    with open("BAC2_answer.csv", "r") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            specs = line.split(',')
            # Compare whole days so the time of day cannot exclude tweets
            # posted on the end date.
            day = datetime.strptime(specs[0], "%Y-%m-%d %H:%M:%S").date()
            if not first_day <= day <= last_day:
                continue
            sentiment = specs[1].strip()
            if sentiment == 'Bullish':
                bucket = positive_dict
            elif sentiment == 'Bearish':
                bucket = negative_dict
            else:
                bucket = neutral_dict
            bucket[day] = bucket.get(day, 0) + 1

    return [positive_dict, negative_dict, neutral_dict]