While and Append in pandas python - python

I am trying to Call Api in a while loop and append the dataframe. But it is not appending .
#Max timestamp
MaxTs = 1635876000
api_key = "api_key"
cnt = 0
while cnt < 4:
url = f"https://min-api.cryptocompare.com/data/v2/histohour?fsym=BTC&tsym=USD&limit=2000&toTs={MaxTs}&api_key={api_key}"
r = requests.get(url)
data = r.json()
price_df = pd.DataFrame(data['Data']['Data'])
i = 0
reccnt = 2000
while i < reccnt:
currTs = price_df.iloc[i]['time']
if currTs < MaxTs:
MaxTs = currTs
i = i + 1
if cnt == 0:
#Copying the Orginal df to new df.
newdf = price_df.copy()
else:
#when counter increases append the df.
newdf.append(price_df)
print(MaxTs)
cnt = cnt + 1

You should increase cnt inside the while loop, not outside.
But after you perform a correction you will get several copies of the same price_df. Is that what you are trying to get?

Related

For pandas query is very slow, How to enhancing performance?

How to quickly find the data that meets the conditions and terminate the subsequent search in advance.Search from right to front until the maximum findlimit of the search.
Expected accuracy <= 0.001s,
Current accuracy >= 0.03s
def test_Find(df:pd.DataFrame,findLimit:int=365):
success = False
current = pd.DataFrame()
start = len(df) -1
loopCount = 0
# Compare from back to front
for i in range(start, -1, -1):
loopCount += 1
if loopCount > findLimit:
return pd.DataFrame()
current = df.iloc[i]
success = Find(i, df)
if success:
return current
return pd.DataFrame()
#Whether the last value is greater than the current value
def LastPassCurrent(curi:int, df:pd.DataFrame):
current = df.iloc[curi]
last = df.iloc[-1]
result = last.c > current["c"]
return result
# Whether the current residual value is Less than or equal to the current value
def RemainNoPassCurrent(curi:int, df:pd.DataFrame)->bool:
cur = df.iloc[curi]
remain = df.iloc[curi+1:-1]
maxC = remain["c"].max()
if np.isnan(maxC):
maxC = 0
remainNoPassCurrent = maxC <= cur["c"]
return remainNoPassCurrent
# Qualified search
def Find(curi, df):
current = df.iloc[curi]
result = (current.a == 8) and \
RemainNoPassCurrent(curi, df) and \
LastPassCurrent(curi, df)
return result
#test data
dfs = []
for i in range(0, 4000):
dfs.append(pd.DataFrame(np.arange(365*3).reshape(365,3), columns=list('abc')))
#Test time collection
df = None
for i in range(0, 4000):
df = dfs[i]
start_time = time.time()
data = test_Find(df, 365)
end_time = time.time()
result = end_time - start_time
print(f'loop {i} Empty:{data.empty} time is %.3fs' % result)
#Current testing inefficient time
# loop 0 time is 0.367s ....

Split json file into multiple csv files depending on date?

I am trying to split up a json file from alpha-vantages api into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with but it gives me the TypeError: 'list' object is not callable". I'm fairly new to python and pandas so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json
symbol = "MSFT"
symbol_list = symbol.split(",")
def num_el(list):
count = 0
for element in list:
count += 1
return count
def csv_make(sy, dar, dat):
csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
csv_file.write(dat)
csv_file.close()
i = 0
x = -1
n = num_el(symbol_list)
while i < n:
namesym = symbol_list[x]
ticker = namesym
api_key = 'APIKEYHERE'
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
data = requests.get(url)
dsf = data.json()
daf = pd.DataFrame(dsf['Time Series (1min)'])
dxf: DataFrame = daf.T
dxf.index.name = 'time'
dxf.reset_index(inplace=True)
dxf['time'] = pd.to_datetime(dxf['time'])
dxf['minute'] = dxf['time'].dt.time
dxf['day'] = dxf['time'].dt.day
dxf['date'] = dxf['time'].dt.date
agg = dxf.groupby([dxf['day']])
length1 = dxf.groupby([dxf['day']]).size()
length = pd.DataFrame(length1)
length.index.name = 'day'
length.reset_index(inplace=True)
length_sum = length[0].sum()
v = 0
d = length_sum
b = len(length)
x2 = length_sum
while v < b:
a = length[0][v]
x2 -= length[0][v]
xd = agg.get_group(length['day'][v])
date = xd['date'][x2]
max_dt = parser.parse(str(max(xd['minute'])))
min_dt = parser.parse(str(min(xd['minute'])))
dt_range = []
while min_dt <= max_dt:
dt_range.append(min_dt.strftime("%H:%M:%S"))
min_dt += timedelta(seconds=60)
complete_df = pd.DataFrame({'minute': dt_range})
xy = complete_df.astype('str')
yx = xd.astype('str')
dasf = xy.merge(yx, how='left', on='minute')
dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
time = []
open = []
high = []
low = []
close = []
volume = []
empty_value = []
for ib in range(len(dasf)):
time.append(dasf['minute'][ib])
open.append(dasf['1. open'][ib])
high.append(dasf['2. high'][ib])
low.append(dasf['3. low'][ib])
close.append(dasf['4. close'][ib])
volume.append(dasf['5. volume'][ib])
empty_value.append(dasf['ev'][ib])
time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
high_df = pd.DataFrame(high).rename(columns={0: 'High'})
low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
df = pd.concat(frames, axis=1, join='inner')
df = df.set_index('Time')
ad = df.to_csv()
csv_make(namesym, date, ad)
v += 1
i += 1

The while loop cannot progress pass i = 1

I have a small algorithm like this one.
I coded it in python
import pandas as pd
raw = pd.read_csv
i = 0
T = pd.DataFrame(columns = ['Values'])
singleT = raw.mean() + raw.std()
T = T.append(singleT, ignore_index=True)
if i == 0:
raw = raw.where(raw<T.iloc[i,:])
i += 1
while True:
singleT = raw.mean() + raw.std()
T = T.append(singleT, ignore_index=True)
if T.iloc[i,:].values == T.iloc[i-1,:].values:
break
background = T.iloc[i,:].values
else:
raw = raw.where(raw<T.iloc[i,:])
i += 1
print ('iteration{:02}'.format(i))
However, the loop didn't get pass i = 1 and keep repeating, the whole T array is filled with value when i = 1. I tried several modifications to my code but they also didn't work at all.
Any advice on how to fix this problem would be appreciated!
Thank you very much
Edit: I have inserted one tab for the last 4 lines, as you guys suggested to make sure the else belong to while, but now it has syntax invalid error
Edit2: Here is the correct code for this problem, it will not enter the infinite loop.
i = 0
T = pd.DataFrame(columns = ['Pulse counts'])
singleT = raw.mean() + raw.std()
T = T.append(singleT, ignore_index=True)
if i == 0:
filtered = raw.where(raw<T.iloc[i,:])
i += 1
while True:
singleT = filtered.mean() + filtered.std()
T = T.append(singleT, ignore_index=True)
if T.iloc[i,:].values == T.iloc[i-1,:].values or T.iloc[i-1,:].values == 0:
background = T.iloc[i-1,:].values
break
else:
filtered = filtered.where(filtered<T.iloc[i,:])
print ('iteration{}'.format(i))
i += 1
Try this way :
import pandas as pd
raw = pd.read_csv
i = 0
T = pd.DataFrame(columns = ['Values'])
singleT = raw.mean() + raw.std()
T = T.append(singleT, ignore_index=True)
while True:
singleT = raw.mean() + raw.std()
T = T.append(singleT, ignore_index=True)
if T.iloc[i,:].values == T.iloc[i-1,:].values:
background = T.iloc[i,:].values
break
else:
raw = raw.where(raw<T.iloc[i,:])
#i += 1
print ('iteration{:02}'.format(i))
i += 1
if your indentation is correct, the else part:
else:
raw = raw.where(raw<T.iloc[i,:])
i += 1
print ('iteration{:02}'.format(i))
doesn't belong to the if but to the while, which has a different meaning: it is executed only if loop ends without a break (less-known feature, also available with for which saves the need to define a flag when break has been reached)
So i is never incremented: infinite loop.

Python Pandas How to save output to csv

Hello now im working on my project. I want to get candidate of text block by using algorithm below.
My input is a csv document which contain :
HTML column : the html code in a line
TAG column : the tag of html code in a line
Words : the text inside the tag in aline
TC : the number of words in a line
LTC : the number of anchor words in a line
TG : the number of tag in a line
P : the number of tag p and br in a line
CTTD : TC + (0.2*LTC) + TG - P
CTTDs : the smoothed CTTD
This is my algorithm to find candidate of text block. I make the csv file into dataframe using pandas. I am using CTTDs,TC and TG column to find the candidate.
from ListSmoothing import get_filepaths_smoothing
import pandas as pd
import numpy as np
import csv
filenames = get_filepaths_smoothing(r"C:\Users\kimhyesung\PycharmProjects\newsextraction\smoothing")
index = 0
for f in filenames:
file_html=open(str(f),"r")
df = pd.read_csv(file_html)
#df = pd.read_csv('smoothing/Smoothing001.csv')
news = np.array(df['CTTDs'])
new = np.array(df['TG'])
minval = np.min(news[np.nonzero(news)])
maxval = np.max(news[np.nonzero(news)])
j = 0.2
thetaCTTD = minval + j * (maxval-minval)
#maxGap = np.max(new[np.nonzero(new)])
#minGap = np.min(new[np.nonzero(new)])
thetaGap = np.min(new[np.nonzero(new)])
#print thetaCTTD
#print maxval
#print minval
#print thetaGap
def create_candidates(df, thetaCTTD, thetaGAP):
k = 0
TB = {}
TC = 0
for index in range(0, len(df) - 1):
start = index
if df.ix[index]['CTTDs'] > thetaCTTD:
start = index
gap = 0
TC = df.ix[index]['TC']
for index in range(index + 1, len(df) - 1):
if df.ix[index]['TG'] == 0:
continue
elif df.ix[index]['CTTDs'] <= thetaCTTD and gap >= thetaGAP:
break
elif df.ix[index]['CTTDs'] <= thetaCTTD:
gap += 1
TC += df.ix[index]['TC']
if (TC < 1) or (start == index):
continue
TB.update({
k: {
'start': start,
'end': index - 1
}
})
k += 1
return TB
def get_unique_candidate(TB):
TB = tb.copy()
for key, value in tb.iteritems():
if key == len(tb) - 1:
break
if value['end'] == tb[key+1]['end']:
del TB[key+1]
elif value['start'] < tb[key+1]['start'] < value['end']:
TB[key]['end'] = tb[key+1]['start'] - 1
else:
continue
return TB
index += 1
stored_file = "textcandidate/textcandidate" + '{0:03}'.format(index) + ".csv"
tb = create_candidates(df, thetaCTTD, thetaGap)
TB = get_unique_candidate(tb)
filewrite = open(stored_file, "wb")
df_list = []
for (k, d) in TB.iteritems():
candidate_df = df.loc[d['start']:d['end']]
candidate_df['candidate'] = k
df_list.append(candidate_df)
output_df = pd.concat(df_list)
output_df.to_csv(stored_file)
writer = csv.writer(filewrite, lineterminator='\n')
filewrite.close
ThetaCTTD is 10.36 and thethaGap is 1.
The output is
The output means there are 2 candidates of text block . First the candiate of text block start from line number 215 and end line number 225 (like the pict bellow). And the other candidate of text block start from line number 500 and end line number 501.
My question is how to save the output into csv and not only the number of line but the range of the text block and the others column will appear as the output too?
My expected output is like the screenshot of candidate text block is like this one
Assuming your output is a list of dictionaries:
pd.concat([df.loc[d['start']:d['end']] for (k, d) in TB.iteritems()])
Note that we slice by label, so d['end'] will be included.
Edit: add the candidate number in a new column.
It's cleaner to write a loop than to do two concat operations:
df_list = []
for (k, d) in TB.iteritems():
candidate_df = df.loc[d['start']:d['end']]
candidate_df['candidate'] = k
df_list.append(candidate_df)
output_df = pd.concat(df_list)
It's also faster to concatenate all dataframes at once at the end.

Why is my code looping at just one line in the while loop instead over the whole block?

Sorry for the unsophisticated question title but I need help desperately:
My objective at work is to create a script that pulls all the records from exacttarget salesforce marketing cloud API. I have successfully setup the API calls, and successfully imported the data into DataFrames.
The problem I am running into is two-fold that I need to keep pulling records till "Results_Message" in my code stops reading "MoreDataAvailable" and I need to setup logic which allows me to control the date from either within the API call or from parsing the DataFrame.
My code is getting stuck at line 44 where "print Results_Message" is looping around the string "MoreDataAvailable"
Here is my code so far, on lines 94 and 95 you will see my attempt at parsing the date directly from the dataframe but no luck and no luck on line 32 where I have specified the date:
import ET_Client
import pandas as pd
AggreateDF = pd.DataFrame()
Data_Aggregator = pd.DataFrame()
#Start_Date = "2016-02-20"
#End_Date = "2016-02-25"
#retrieveDate = '2016-07-25T13:00:00.000'
Export_Dir = 'C:/temp/'
try:
debug = False
stubObj = ET_Client.ET_Client(False, debug)
print '>>>BounceEvents'
getBounceEvent = ET_Client.ET_BounceEvent()
getBounceEvent.auth_stub = stubObj
getBounceEvent.search_filter = {'Property' : 'EventDate','SimpleOperator' : 'greaterThan','Value' : '2016-02-22T13:00:00.000'}
getResponse1 = getBounceEvent.get()
ResponseResultsBounces = getResponse1.results
Results_Message = getResponse1.message
print(Results_Message)
#EventDate = "2016-05-09"
print "This is orginial " + str(Results_Message)
#print ResponseResultsBounces
i = 1
while (Results_Message == 'MoreDataAvailable'):
#if i > 5: break
print Results_Message
results1 = getResponse1.results
#print(results1)
i = i + 1
ClientIDBounces = []
partner_keys1 = []
created_dates1 = []
modified_date1 = []
ID1 = []
ObjectID1 = []
SendID1 = []
SubscriberKey1 = []
EventDate1 = []
EventType1 = []
TriggeredSendDefinitionObjectID1 = []
BatchID1 = []
SMTPCode = []
BounceCategory = []
SMTPReason = []
BounceType = []
for BounceEvent in ResponseResultsBounces:
ClientIDBounces.append(str(BounceEvent['Client']['ID']))
partner_keys1.append(BounceEvent['PartnerKey'])
created_dates1.append(BounceEvent['CreatedDate'])
modified_date1.append(BounceEvent['ModifiedDate'])
ID1.append(BounceEvent['ID'])
ObjectID1.append(BounceEvent['ObjectID'])
SendID1.append(BounceEvent['SendID'])
SubscriberKey1.append(BounceEvent['SubscriberKey'])
EventDate1.append(BounceEvent['EventDate'])
EventType1.append(BounceEvent['EventType'])
TriggeredSendDefinitionObjectID1.append(BounceEvent['TriggeredSendDefinitionObjectID'])
BatchID1.append(BounceEvent['BatchID'])
SMTPCode.append(BounceEvent['SMTPCode'])
BounceCategory.append(BounceEvent['BounceCategory'])
SMTPReason.append(BounceEvent['SMTPReason'])
BounceType.append(BounceEvent['BounceType'])
df1 = pd.DataFrame({'ClientID': ClientIDBounces, 'PartnerKey': partner_keys1,
'CreatedDate' : created_dates1, 'ModifiedDate': modified_date1,
'ID':ID1, 'ObjectID': ObjectID1,'SendID':SendID1,'SubscriberKey':SubscriberKey1,
'EventDate':EventDate1,'EventType':EventType1,'TriggeredSendDefinitionObjectID':TriggeredSendDefinitionObjectID1,
'BatchID':BatchID1,'SMTPCode':SMTPCode,'BounceCategory':BounceCategory,'SMTPReason':SMTPReason,'BounceType':BounceType})
#print df1
#df1 = df1[(df1.EventDate > "2016-02-20") & (df1.EventDate < "2016-02-25")]
#AggreateDF = AggreateDF[(AggreateDF.EventDate > Start_Date) and (AggreateDF.EventDate < End_Date)]
print(df1['ID'].max())
AggreateDF = AggreateDF.append(df1)
print(AggreateDF.shape)
#df1 = df1[(df1.EventDate > "2016-02-20") and (df1.EventDate < "2016-03-25")]
#AggreateDF = AggreateDF[(AggreateDF.EventDate > Start_Date) and (AggreateDF.EventDate < End_Date)]
print("Final Aggregate DF is: " + str(AggreateDF.shape))
#EXPORT TO CSV
AggreateDF.to_csv(Export_Dir +'DataTest1.csv')
#with pd.option_context('display.max_rows',10000):
#print (df_masked1.shape)
#print df_masked1
except Exception as e:
print 'Caught exception: ' + str(e.message)
print e
Before my code parses the data, the orginal format I get of the data is a SOAP response, this is what it look like(below). Is it possible to directly parse records based on EventDate from the SOAP response?
}, (BounceEvent){
Client =
(ClientID){
ID = 1111111
}
PartnerKey = None
CreatedDate = 2016-05-12 07:32:20.000937
ModifiedDate = 2016-05-12 07:32:20.000937
ID = 1111111
ObjectID = "1111111"
SendID = 1111111
SubscriberKey = "aaa#aaaa.com"
EventDate = 2016-05-12 07:32:20.000937
EventType = "HardBounce"
TriggeredSendDefinitionObjectID = "aa111aaa"
BatchID = 1111111
SMTPCode = "1111111"
BounceCategory = "Hard bounce - User Unknown"
SMTPReason = "aaaa"
BounceType = "immediate"
Hope this makes sense, this is my desperately plea for help.
Thank you in advance!
You don't seem to be updating Results_Message in your loop, so it's always going to have the value it gets in line 29: Results_Message = getResponse1.message. Unless there's code involved that you didn't share, that is.

Categories

Resources