import pandas as pd
import numpy as np
import datetime
import schedule
import time
# api is assumed to be an already-initialized, logged-in market-data client (e.g. Shioaji)
ticks = api.ticks(api.Contracts.Stocks["2330"], "2022-08-09")
df = pd.DataFrame({**ticks})
df.ts = pd.to_datetime(df.ts)
df = df[df.volume > 200]
df
The code above works fine; I get data. The code below does not work: I get nothing. It just keeps running, but no data comes out.
My goal is to run the code (and receive data) every 2 minutes, automatically.
I couldn't figure out where it goes wrong. I would appreciate some help; I have tried many times and spent a lot of time on this.
import pandas as pd
import numpy as np
import datetime
import schedule
import time
def show_datafr():
    ticks = api.ticks(api.Contracts.Stocks["2330"], "2022-08-09")
    df = pd.DataFrame({**ticks})
    df.ts = pd.to_datetime(df.ts)
    df = df[df.volume > 200]
    df
schedule.every(4).seconds.do(show_datafr)
while 1:
    schedule.run_pending()
    time.sleep(1)
To display df, you can import display from IPython.display.
You may need to install it with pip install ipython in case you don't have it installed.
import pandas as pd
import numpy as np
import datetime
import schedule
import time
from IPython.display import display  # Additional import

def show_datafr():
    ticks = api.ticks(api.Contracts.Stocks["2330"], "2022-08-09")
    df = pd.DataFrame({**ticks})
    df.ts = pd.to_datetime(df.ts)
    df = df[df.volume > 200]
    display(df)  # Display the dataframe; a bare `df` is a no-op inside a function

schedule.every(2).minutes.do(show_datafr)  # Remember you said every 2 minutes

while True:
    schedule.run_pending()
    time.sleep(1)
If you want to run every 2 minutes, the schedule line is quite strange. It should be:

schedule.every(2).minutes.do(show_datafr)

instead of:

schedule.every(4).seconds.do(show_datafr)

as what you wrote runs every 4 seconds, and the operation may not finish within 4 seconds, which could explain the missing output.
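One more detail worth knowing: the schedule library waits a full interval before the first run, so with every(2).minutes nothing appears for the first two minutes. A minimal sketch that also triggers an immediate first call (assuming show_datafr is defined as above):

show_datafr()  # run once right away; schedule fires the first scheduled call only after a full interval
schedule.every(2).minutes.do(show_datafr)
while True:
    schedule.run_pending()
    time.sleep(1)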
Related
I want to calculate some data with the pyIGRF library, but when I export it to CSV, all rows have the same parameters.
import pyIGRF
import os
import numpy as np
import pandas as pd
os.chdir('D:/IGRF')
df = pd.read_csv('igrf.csv')
print(df)
df.head()

for i in range(0, len(df)):
    x = [Inc, Dec, Hi, Xn, Yn, Zn, totalmag] = pyIGRF.igrf_value(df['lan'][i], df['lat'][i], df['alt'][i], 2022)
    for j in x:
        df['Inc'] = Inc
        df['Dec'] = Dec
        df['total'] = totalmag

print(df)
df.to_csv('IGRF_end.csv')
I think the loop needs some changes, but I couldn't work out what to change.
One problem that I see when looking at the pyIGRF documentation is that you've got the order of the inputs wrong. It should be:
pyIGRF.igrf_variation(lat, lon, alt, date)
And you've switched the latitude and longitude.
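There is a second problem that explains why every row comes out identical: df['Inc'] = Inc assigns one scalar to the entire column on each pass, so the exported CSV only reflects the final row. A minimal sketch of a per-row fix using .loc (keeping the asker's unpacking order, which should be double-checked against the pyIGRF docs, and assuming the longitude column is really 'lon' rather than 'lan'):

for i in range(len(df)):
    Inc, Dec, Hi, Xn, Yn, Zn, totalmag = pyIGRF.igrf_value(df['lat'][i], df['lon'][i], df['alt'][i], 2022)
    df.loc[i, 'Inc'] = Inc      # .loc writes only row i, not the whole column
    df.loc[i, 'Dec'] = Dec
    df.loc[i, 'total'] = totalmag
df.to_csv('IGRF_end.csv', index=False)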
How can I print a new dataframe and clear the previously printed dataframe while using a loop, so the output shows only the last dataframe rather than all of them?
Using print(df, end="\r") doesn't work.
import pandas as pd
import numpy as np

while True:
    df = pd.DataFrame(np.random.rand(10, 10))
    print(df)
If I get live data from an API to insert into the df, I'll use the while loop to constantly update the data. But how can I print only the newest dataframe instead of printing all the dataframes underneath each other in the output?
If I use the snippet below, it does work, but I think there should be a more elegant solution.
import sys  # needed for sys.stdout.write below
import pandas as pd
import numpy as np

Height_DF = 10
Width_DF = 10

while True:
    df = pd.DataFrame(np.random.rand(10, 10))
    print(df)
    for i in range(Height_DF + 1):
        sys.stdout.write("\033[F")  # ANSI escape: move the cursor up one line
try this:

import pandas as pd
import numpy as np
import time
import sys

while True:
    df = pd.DataFrame(np.random.rand(10, 10))
    print(df)
    sys.stdout.write("\033[F" * (df.shape[0] + 1))  # move back up over every line just printed
    time.sleep(1)
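In a Jupyter notebook the ANSI escape trick may not work at all. A sketch of an alternative using IPython's clear_output (assuming the loop runs inside a notebook cell):

import time
import numpy as np
import pandas as pd
from IPython.display import display, clear_output

while True:
    df = pd.DataFrame(np.random.rand(10, 10))
    clear_output(wait=True)  # wait=True clears only once new output arrives, reducing flicker
    display(df)
    time.sleep(1)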
Suppose I have a list of census variable codes I am downloading from the census data API.
Example:
variable_list = [
'B08006_017E',
'B08016_002E',
'B08016_003E',
'B08016_004E',
...
]
Now, given memory constraints, I can't put all of this data into one CSV file. I want a way to place blocks of 100 variables from the variable list into a number of CSV files. For example, if I have 200 variables, then I would have two CSV files: one with the first 100 variables and one with the second 100. I hope that is clear.
This is how I am currently downloading the data:
import pandas as pd
import censusdata
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 2)
#import statsmodels.formula.api as sm
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
import censusgeocode as cg
import numpy as np
from numbers import Number
import plotly
import matplotlib.pyplot as plt
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import requests
import pandas
import geopandas
import json
import math
from haversine import haversine
from ipfn import ipfn
import networkx
from matplotlib import pyplot
from matplotlib import patheffects
from shapely.geometry import LineString, MultiLineString
variable_list1 = [
    'B08006_017E',
    'B08016_002E',  # note: the original was missing this comma, which silently concatenates adjacent strings
    'B08016_003E',
    'B08016_004E'
]
all_variable_lists = [variable_list1]
print(len(all_variable_lists[0]))
#2) For each year, download the relevant variables for each tract
def download_year(year, variable_list, State, County, Tract):
    df = censusdata.download('acs5', year, censusdata.censusgeo([('state', State), ('county', County), ('tract', Tract)]), variable_list, key='e39a53c23358c749629da6f31d8f03878d4088d6')
    df['Year'] = str(year)
    return df
#3) Define function to download for a single year and state
def callback_arg(i, variable_list, year):
    try:
        print('Downloading - ', year, 'State', i, ' of 57')
        if i < 10:
            df = download_year(year, variable_list, '0' + str(i), '*', '*')
            return df
        if i == 51:
            df = download_year(year, variable_list, str(i), '*', '*')
            return df
        else:
            df = download_year(year, variable_list, str(i), '*', '*')
            return df
    except:
        pass
#4) Function to download for all states and all years, with some slight formatting
def download_all_data(variable_list, max_year):
    df = download_year(2012, variable_list, '01', '*', '*')
    for year in range(2012, max_year + 1):
        if year == 2012:
            for i in range(0, 57):
                df = df.append(callback_arg(i, variable_list, year))
        else:
            for i in range(0, 57):
                df = df.append(callback_arg(i, variable_list, year))
    df2 = df.reset_index()
    df2 = df2.rename(columns={"index": "Location+Type"}).astype(str)
    df2['state'] = df2["Location+Type"].str.split(':').str[0].str.split(', ').str[2]
    df2['Census_tract'] = df2["Location+Type"].str.split(':').str[0].str.split(',').str[0].str.split(' ').str[2][0]
    df2['County_name'] = df2["Location+Type"].str.split(':').str[0].str.split(', ').str[1]
    return df2
#5) Some slight formatting
def write_to_csv(df2, name='test'):
    df2.to_csv(name)

#6) Run the entire download sequence (note: this second definition overrides write_to_csv above)
def write_to_csv(df, ide):
    df.to_csv('test' + str(ide) + '.csv')

list_of_dfs = []
for var_list in all_variable_lists:
    list_of_dfs.append(download_all_data(var_list, 2012))
x1 = list_of_dfs[0].reset_index()
# x3 = pd.merge(x1, x2, on=['index', 'Location+Type', 'Year', 'state', 'Census_tract', 'County_name'])
write_to_csv(x1, 1)
If anyone can give me some ideas on how to achieve this, it would greatly help me. Thank you.
It looks like you're already chunking the variable_lists here:
for var_list in all_variable_lists:
    list_of_dfs.append(download_all_data(var_list, 2012))

Just make sure each var_list has only 100 items. Then chunk the CSV writing in the same way, using enumerate to increment the index for the filename:

for index, out_list in enumerate(list_of_dfs):
    write_to_csv(out_list.reset_index(), index)
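For completeness, one way to build those 100-item blocks from a single full variable_list is a slice-based comprehension (a sketch; variable_list is assumed to hold all of the census variable codes):

chunk_size = 100
all_variable_lists = [
    variable_list[i:i + chunk_size]
    for i in range(0, len(variable_list), chunk_size)
]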
If you're just looking to break up the final output at write time:

for index, out_list in enumerate(np.array_split(x1, 100)):
    write_to_csv(out_list, index)

(Note that np.array_split(x1, 100) splits x1 into 100 roughly equal row-chunks, not into chunks of 100 rows.)
import numpy as np
import pandas as pd
from concurrent import futures  # needed for futures.ProcessPoolExecutor below
from multiprocessing import Pool
import threading

# Load the data
df = pd.read_csv('crsp_short.csv', low_memory=False)

def funk(date):
    ...
    # for each date in df.date.unique(), do stuff which gives a sample dataframe
    # as an output, then write it to file
    sample.to_csv('crsp_full.csv', mode='a')

def evaluation(f_list):
    with futures.ProcessPoolExecutor() as pool:
        return pool.map(funk, f_list)

# list_s is a list of dates I want to calculate the function funk for
evaluation(list_s)
I get a CSV file as output with some of the lines messed up, because Python is writing pieces from different processes at the same time. I guess I need to use queues, but I was not able to modify the code so that it worked. Any ideas how to do it? Otherwise it takes ages to get the results.
That solved the problem (Pool does the queuing for you):
Python: Writing to a single file with queue while using multiprocessing Pool
My version of the code, which didn't mess up the output CSV file:
import numpy as np
import pandas as pd
from multiprocessing import Pool

# Load the data
df = pd.read_csv('crsp_short.csv', low_memory=False)

def funk(date):
    ...
    # for each date in df.date.unique(), do stuff which gives a sample dataframe
    # as an output
    return sample

# list_s is a list of dates I want to calculate the function funk for
def mp_handler():
    # 28 is the number of processes I want to run
    p = Pool(28)  # the original called multiprocessing.Pool, but only Pool was imported
    for result in p.imap(funk, list_s):
        result.to_csv('crsp_full.csv', mode='a')

if __name__ == '__main__':
    mp_handler()
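One detail to watch with mode='a': to_csv writes the column header on every append, so the combined file will contain repeated header rows. A small sketch that writes the header only for the first chunk (assuming crsp_full.csv does not already exist):

first = True
for result in p.imap(funk, list_s):
    result.to_csv('crsp_full.csv', mode='a', header=first)
    first = False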
from https://pypi.org/project/tqdm/:
import pandas as pd
import numpy as np
from tqdm import tqdm

df = pd.DataFrame(np.random.randint(0, 100, (100000, 6)))
tqdm.pandas(desc="my bar!")
df.progress_apply(lambda x: x**2)
I took this code and edited it so that I create a DataFrame from pd.read_excel rather than using random numbers:
import pandas as pd
from tqdm import tqdm
import numpy as np
filename="huge_file.xlsx"
df = pd.DataFrame(pd.read_excel(filename))
tqdm.pandas()
df.progress_apply(lambda x: x**2)
This gave me an error, so I changed df.progress_apply to this:
df.progress_apply(lambda x: x)
Here is the final code:
import pandas as pd
from tqdm import tqdm
import numpy as np
filename="huge_file.xlsx"
df = pd.DataFrame(pd.read_excel(filename))
tqdm.pandas()
df.progress_apply(lambda x: x)
This results in a progress bar, but it doesn't actually show live progress; rather, it loads the bar, and when the operation is done it jumps to 100%, defeating the purpose.
My question is this: How do I make this progress bar work?
What does the function inside of progress_apply actually do?
Is there a better approach? Maybe an alternative to tqdm?
Any help is greatly appreciated.
This will not work. pd.read_excel blocks until the file is read, and there is no way to get information from this function about its progress during execution.
It would work for read operations which you can do chunk-wise, like:

chunks = []
for chunk in pd.read_csv(..., chunksize=1000):
    update_progressbar()  # hypothetical hook; stands in for e.g. bar.update() on a tqdm instance
    chunks.append(chunk)

But as far as I understand it, tqdm also needs the number of chunks in advance, so for a proper progress report you would need to read the full file first...
The following is a one-liner solution utilizing tqdm:
import pandas as pd
from tqdm import tqdm
df = pd.concat([chunk for chunk in tqdm(pd.read_csv(file_name, chunksize=1000), desc='Loading data')])
If you know the total number of lines to be loaded, you can add that information with the total parameter of the tqdm function, resulting in a percentage output.
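A sketch of that variant (assuming the file has a single header row; counting the lines costs one extra pass over the file):

import math
n_rows = sum(1 for _ in open(file_name)) - 1  # subtract the header row
n_chunks = math.ceil(n_rows / 1000)
df = pd.concat(tqdm(pd.read_csv(file_name, chunksize=1000), total=n_chunks, desc='Loading data'))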
This might help people with a similar problem; the tqdm documentation is a good place to get help.
For example:

for i in tqdm(range(0, 3), ncols=100, desc="Loading data.."):
    df = pd.read_excel("some_file.xlsx", header=None)
    LC_data = pd.read_excel("some_file.xlsx", 'Sheet1', header=None)
    FC_data = pd.read_excel("some_file.xlsx", 'Sheet2', header=None)
    # note: the bar advances once per loop iteration, not with actual read progress

print("------ Loading is completed ------")
DISCLAIMER: This works only with the xlrd engine and is not thoroughly tested!
How does it work? We monkey-patch the xlrd.xlsx.X12Sheet.own_process_stream method, which is responsible for loading sheets from a file-like stream. We supply our own stream that contains our progress bar. Each sheet gets its own progress bar.
When we want the progress bar, we use the load_with_progressbar() context manager and then call pd.read_excel('<FILE.xlsx>').
import xlrd
from tqdm import tqdm
from io import RawIOBase
from contextlib import contextmanager

class progress_reader(RawIOBase):
    def __init__(self, zf, bar):
        self.bar = bar
        self.zf = zf

    def readinto(self, b):
        n = self.zf.readinto(b)
        self.bar.update(n=n)
        return n

@contextmanager
def load_with_progressbar():
    def my_get_sheet(self, zf, *other, **kwargs):
        with tqdm(total=zf._orig_file_size) as bar:
            sheet = _tmp(self, progress_reader(zf, bar), **kwargs)
        return sheet

    _tmp = xlrd.xlsx.X12Sheet.own_process_stream
    try:
        xlrd.xlsx.X12Sheet.own_process_stream = my_get_sheet
        yield
    finally:
        xlrd.xlsx.X12Sheet.own_process_stream = _tmp

import pandas as pd

with load_with_progressbar():
    df = pd.read_excel('sample2.xlsx')

print(df)
Screenshot of progress bar:
The following is based on user rocksportrocker's excellent answer.
I am a Python beginner!
Below, please find my first version using rocksportrocker's recommendation.
import pandas as pd

print("Info: Loading starting.")

# https://stackoverflow.com/questions/52209290
temp = []
myCounter = 1
myChunksize = 10000

# https://stackoverflow.com/questions/24251219/
for myChunk in pd.read_csv('YourFileName.csv', chunksize=myChunksize, low_memory=False):
    print('# of rows processed: ', myCounter * myChunksize)
    myCounter = myCounter + 1
    temp.append(myChunk)

print("Info: Loading complete.")

# https://stackoverflow.com/questions/33642951
df = pd.concat(temp, ignore_index=True)
df.head()