Im doing a scrape project from api url, scraping value(datetime,open,close,high,low) and write to mysql database
code
startTime_source = "https://api.binance.com/api/v1/klinesstartTime=0&symbol=btcusdt"&interval=1h&limit=1000"
startTime_source_get = requests.get(startTime_source)
startTime_source_json = json.loads(startTime_source_get.text)
#timestamp now
import time;
tsnw = time.time()*1000.0
tsnw = (int(tsnw))
for y in range(startTime_source_json[0][0],tsnw,3600000000): #scrape data from start to end
data = "https://api.binance.com/api/v1/klines?startTime=" + str(y) + "&symbol="+coin[x]+"&interval=1h&limit=1000"
data_get = requests.get(data)
#convert class response to json list
data_list = json.loads(data_get.text)
#loop for convert timestamp to datetime
for z in range(0,len(data_list)):
#delete all feature except dt_ which is after id=5
del data_list[z][5:]
#assign dt_ as timestamp which is id=1
timestamp = data_list[z][0]
#convert msec to sec
timestamp = time.localtime(timestamp / 1000.0)
#convert timestamp to datetime
dt = time.strftime("%Y-%m-%d %H:%M:%S", timestamp)
#replace value timestamp as dt
data_list[z][0] = dt
#seperate each feature
dt_ = [data_list[z][0]] #datetime
open_ = [data_list[z][1]] #open price
close_ = [data_list[z][4]] #close price
high_ = [data_list[z][2]] #high price
low_ = [data_list[z][3]] #low price
#cal the value of change and write to mysql database
num_1 = float(data_list[z-1][4])
num_2 = float(data_list[z][4])
#calculator formula
cal_change = ((num_2 - num_1) / num_1)*100
#round for 2 digit decimal ex 0.00
change_ = [round(cal_change,2)]
data_to_insert = [dt_+open_+close_+high_+low_+change_]
try:
#insert value
write_data = "INSERT INTO" + " " + coin[x] + "(dt_,open_,close_,high_,low_,change_) VALUES (%s,%s,%s,%s,%s,%s)"
mycursor.executemany(write_data,data_to_insert)
#it show error for multiple times
except mysql.connector.Error as err :
if err.sqlstate == "23000":
print('error')
pass
The problem is multiple rows has wrong calculation, example below dttime 2020-03-10 18:00:00 and the change value should be 0.33 not -18.21, why is that? help please, thank you.
I need to parse exactly those publications where the publication date coincides with today's date. I only need to match the day. For example, 20 = 20. Here's what I did, but this is not good code:
today = date.today()
d2 = today.strftime("%B %d, %Y")
today_day = d2[8] + d2[9]
for el in items:
title = el.select('.card-stats > div')
p = title[1].text
space = p.replace(" ","")
day = space[1] + space[2]
if day == today_day:
data_id = el.get('data-id')
I think that the mistake could be in the second part of your code where you throw a variable, day, you didn't define before.
Try:
today = date.today()
d2 = today.strftime("%B %d, %Y")
today_day = d2[8] + d2[9]
for el in items:
title = el.select('.card-stats > div')
p = title[1].text
space = p.replace(" ","")
day = space[1] + space[2]
if today == today_day:
data_id = el.get('data-id')
This is the output of my python script so far.
Excel Table
The vertical axis of the table are road names. The horizontal axis are dates. The values indicate if a road was under construction at the time and why. I'd like to make a line graph that groups the dates by years 2017, 2018, 2019 etc... and plots the longest amount a time within those groups that a road was under construction and the average amount for the whole year. I'm a complete novice in excel and don't know how to leverage it's features to achieve my goal, though I suspect that there may be built in functions that do what I want without much difficulty. Any suggestions on how can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()
def to_raw(string):
return fr"{string}"
def cvrt(x):
ans = re.split(r'(\d+)(?!.*\d)', x)
return int(ans[1])
def indexer(s):
pattern = re.compile(r'I, [0-9]+, ')
gm = re.split(pattern, s);
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def int2Date(x):
string = str(x)
Y = int(string[0:4])
M = int(string[4:6])
D = int(string[6:8])
return DATE(Y,M,D)
def dDelta(x, y):
string1 = str(x)
string2 = str(y)
Y1 = int(string1[0:4])
M1 = int(string1[4:6])
D1 = int(string1[6:8])
Y2 = int(string2[0:4])
M2 = int(string2[4:6])
D2 = int(string2[6:8])
f_date = DATE(Y1,M1,D1)
l_date = DATE(Y2,M2,D2)
delta = l_date - f_date
if isinstance(y, int):
return float(int((delta.days)/30.44))
else:
return int((delta.days)/30.44)
def Book(path):
file = open(path,'r')
lines = file.readlines()
file.close()
book = IndexedOrderedDict()
for line in lines:
if re.match("I", line):
IDs = indexer(line)[1]
if re.match(" 0.00,", line):
rID = line
#"GM_FINAL_AUTH,0,[1-9]"
if re.search("GM_FINAL_AUTH,0,[1-9]", line):
book.update({(rID, line): to_raw(IDs)})
return sort_book(book)
def dUpdate(dic, key, value):
return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})
def valSplt(s):
pattern = re.compile(r'(\d+)')
gm = re.split(pattern, s)
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def sort_book(book):
book = natsorted([value, key] for key, value in book.items())
book = IndexedOrderedDict((data[1], data[0]) for data in book)
return book
def alph_order(word1, word2):
for i in range(min(len(word1), len(word2))):
if ord(word1[i]) == ord(word2[i]):
pass
elif ord(word1[i]) > ord(word2[i]):
return word2
else:
return word1
return word1
def read(cpdm, date_list):
sCnt = [0] * len(cpdm)
lowest_number = 999999999999
terminationCondition = [True] * len(cpdm)
saved_results = [0] * len(cpdm)
current_prefix = None
cnt = 0
while any(terminationCondition) is True:
saved_results = [0] * len(cpdm)
last_prefix = None
lowest_number = 999999999999
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
ID = cpdm[dicIdx].values()[dicVal]
# print(entry)
current_prefix, road_number = valSplt(ID)
road_number = int(road_number)
if last_prefix is None:
last_prefix = current_prefix
higherOrder_prefix = alph_order(last_prefix, current_prefix)
# print('check:',[higherOrder_prefix, last_prefix, current_prefix])
if current_prefix == higherOrder_prefix:
if current_prefix != last_prefix:
lowest_number = road_number
last_prefix = current_prefix
elif road_number < lowest_number:
lowest_number = road_number
last_prefix = current_prefix
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
# print(dicIdx, dicVal, len(cpdm[dicIdx]))
ID = cpdm[dicIdx].values()[dicVal]
VALUE = cpdm[dicIdx].keys()[dicVal]
# print(entry)
road_name, road_number = valSplt(ID)
road_number = int(road_number)
if road_name == last_prefix and lowest_number == road_number:
saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
if dicVal < len(cpdm[dicIdx]):
sCnt[dicIdx] += 1
else:
terminationCondition[dicIdx] = False
else:
terminationCondition[dicIdx] = False
for rst in range(len(saved_results)):
if saved_results[rst] == 0:
pass
else:
sheet.write(cnt+1, 0, str(saved_results[rst][0]))
sheet.write(cnt+1, rst+1, cvrt(saved_results[rst][1]))
#sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
#sheet.write(cnt+1, 0, saved_results[rst][3])
cnt += 1
def main():
# 2018 MAPS
path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
# 2019 MAPS
path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
# 2020 MAPS
path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
p_list = [path1, path2, path3, path4, path5, path6, path7,
path8, path9, path10, path11, path12, path13]
pool = mp.Pool(mp.cpu_count())
CPDM = pool.map(Book, p_list)
pool.close()
#pool.join()
date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
#CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
for i in CPDM:
print(len(i))
#sheet.write("A1", "Lat Long")
sheet.write("A1", "ID")
#for i in range(len(CPDM)):
cn = 0
for i in date_list:
#sheet.write(0, 3*i+1, "ID" + str(i+1))
sheet.write(0, cn+1, int2Date(i), format1)
cn += 1
#sheet.write(0, 2*i+3, "Date" + str(i+1))
read(CPDM, date_list)
workBook.close()
if __name__ == "__main__":
main()
executionTime = (time.time() - startTime)
print('Execution time in minutes: ' + str(executionTime/60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
# Group by consecutive 1's
groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
# Every group denotes a period for which the road was under construction
for _, group in groups:
# Get the start and finish for each group
roads.append({
"road": road_name,
"start": group.index[0],
"finish": group.index[-1] + pandas.Timedelta(1, unit="D"), # Add one day because groups with same start and finish will not be visible on the plot
})
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()
So I got this error raised
ValueError: time data '8/16/2016 9:55' does not match format '%m/&d/%Y
%H:%M'.
I know that %m is the format for month with two digits (zero-padded). And as we can see that '8' (August) does not have zero padded. Is that the problem for this error? And how I fix this?
import datetime as dt
result_list = []
for a in ask_posts:
result_list.append([a[6], int(a[4])])
counts_by_hour = {}
comments_by_hour = {}
date_format = '%m/&d/%Y %H:%M'
for row in result_list:
date = row[0]
comment = row[1]
time = dt.datetime.strptime(date, date_format).strftime("%H")
``` I want to extract the Hour only```
if time not in counts_by_hour:
counts_by_hour[time] = 1
comments_by_hour[time] = comment
else:
counts_by_hour[time] += 1
comments_by_hours[time] += comment
you have an error in your dateformat % not &
import datetime as dt
result_list = []
for a in ask_posts:
result_list.append([a[6], int(a[4])])
counts_by_hour = {}
comments_by_hour = {}
date_format = '%m/%d/%Y %H:%M' # change & with %
for row in result_list:
date = row[0]
comment = row[1]
time = dt.datetime.strptime(date, date_format).strftime("%H")
``` I want to extract the Hour only```
if time not in counts_by_hour:
counts_by_hour[time] = 1
comments_by_hour[time] = comment
else:
counts_by_hour[time] += 1
comments_by_hours[time] += comment
I have this code in Python:
import datetime
import re
import pymongo
from datetime import timedelta, date
def daterange(d, d1):
for n in range(int ((d1 - d).days)):
yield d + timedelta(n)
#conect to db
uri = "mongodb://127.0.0.1:27017"
client = pymongo.MongoClient(uri)
database = client['db']
collection = database['currency']
d = input('Insert beginning date (yyyy-mm-dd): ')
d1 = input('Insert end date (yyyy-mm-dd): ')
#search db
item = collection.find_one({"date" : d})
item1 = collection.find_one({"date" : d1})
datas = item['date']
datas1 = item1['date']
#convert string to object
dataObject = datetime.datetime.strptime(datas, "%Y-%m-%d")
dataObject1 = datetime.datetime.strptime(datas1, "%Y-%m-%d")
#range
mylist = []
for single_date in daterange(dataObject, dataObject1):
mylist.append(single_date.strftime("%Y-%m-%d"))
print(single_date.strftime("%Y-%m-%d"))
print(mylist)
item = collection.find_one({"date" : mylist[0]})
print(item)
If a user inserts a beginning date like 2018-05-07 and an end date like 2018-05-11 it will print:
2018-05-07
2018-05-08
2018-05-09
2018-05-10
In this case it will only print until the 10th day, how should I do to print also the end date (2018-05-11)?
There are many solutions to your question, however, I believe the easiest would be to adjust the daterange(d, d1) function, by simply adding 1 to the range(int ((d1 - d).days)).
def daterange(d, d1):
for n in range(int ((d1 - d).days) + 1):
yield d + timedelta(n)
The reason for this is that, as per documentation, range(stop) does not output the stop value, but only the values 'before' it.