How to use multiprocess to update pandas.dataframe in python? - python

The goal of the following code is to sort the big_df by timestamp and ordered by userid. Finally, I get the selected_df.
import pandas as pd
from collections import Counter
def init_process():
big_df = pd.Dataframe()
big_df = ... # It contains four columns and many rows.
big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
users_list = Counter(big_df['userid'])
selected_df = pd.DataFrame() # it is an empty dataframe
for uid in users_list:
small_df = big_df [(big_df[0] == uid )]
new_user_items_df = small_df.sort_values(by=['timestamp'])
selected_df = selected_df.append(new_user_items_df, ignore_index=True)
if __name__ == '__main__':
init_process()
But when the big_df is so large, sometimes 80G, it will take many days to process it. Therefore, I want to use multiprocess to make it parallel. Then, I have the following code.
import pandas as pd
from collections import Counter
from multiprocessing import Pool
selected_df = pd.Dataframe()
big_df = pd.Dataframe()
def process_pd(uid):
global selected_df
small_df = big_df[(big_df[0] == uid)]
new_user_items_df = small_df.sort_values(by=['timestamp'])
selected_df = selected_df.append(new_user_items_df, ignore_index=True)
def init_process():
global big_df
big_df = ... # It contains four columns and many rows.
big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
users_list = Counter(big_df['userid'])
selected_df = pd.DataFrame() # it is an empty dataframe
num_cpus = 5
process_pool = Pool(processes=num_cpus)
dfs = process_pool.map(process_pd, users_list)
print(selected_df)
if __name__ == '__main__':
init_process()
The selected_df is empty. I think maybe it is because the multiple processes do not update the global dataframe selected_df. But I am new of pandas, could anyone tell me how to modify this code?
Thank you very much.

Related

Read CSV file with Python function

I'm trying to write my read/write function to a csv, but it can't return any value.
I'm reading from a CSV, replacing the " ; " in the second column with " " and performing and saving the csv already handled.
But for some reason it doesn't save my csv, is my function wrong?
I'm starting out in the Python world, and I'm having a bit of trouble.
import pandas as pd
header_col = ['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';', names=header_col, header=None)
def file_load(df):
df['col1'] = df['col1'].str.replace(';',' ')
df.drop(columns=['col8'], inplace=True)
df.drop(columns=['col9'], inplace=True)
return df
def save_file(dataframe):
df = dataframe
df.to_csv('myfile_<date>_treat.csv' ,sep=';', encoding='utf-8', index=False)
import pandas as pd
def file_load(df):
df['col1'] = str(df['col1']).replace(';',' ')
df.drop(columns=['col8'], inplace=True)
df.drop(columns=['col9'], inplace=True)
return df
def save_file(dataframe):
df = dataframe
df.to_csv('myfile_<date>_treat.csv' ,sep=',', encoding='utf-8',
index=False)
def main():
header_col=
['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';',
names=header_col, header=None)
df1 = file_load(df)
save_file(df1)
if __name__ == '__main__':
main()

Read multiple file in python and generate one output

I have a python script for generating 1 upload file from 1 input file.
The thing is that the input files have started coming in batches, 30-50 at one time.
e.g.:
1111.xlsx --> upload.xlsx
1125.xlsx --> upload.xlsx
1176.xlsx --> upload.xlsx
1322.xlsx --> upload.xlsx
The code just converting the input files in the upload format.
Here's what I have done so far (1 input file -> 1 output file):
def main():
initial_workbook = 'C:/files/1111.xlsx'
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames , inplace=True)
initial_df.dropna(axis=1, how='all')
output = initial_df.to_excel('C:/files/upload_file.xlsx', index = False)
Is there a way to generate one upload file for all the files from the input folder. And once the files input files have been processed, rename them by prefixing x in front of it. e.g. x1111.xlsx
So here is how I will approach, for a given batch:
from datetime import datetime
import os
from pathlib import Path
all_dfs = []
proj_path = Path("C:/files/")
for f in os.listdir(proj_path):
if f.endswith(".xlsx"):
print(f"processing {f}...")
df_tmp = main(proj_path / f)
df_tmp["file_name"] = f
all_dfs.append(df_tmp)
df_all = pd.concat(all_dfs, axis=0)
df_all.to_excel(proj_path / f"{datetime.now()}_batch.xlsx", index = False)
def main(f):
initial_workbook = proj_path / f
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames, inplace=True)
initial_df.dropna(axis=1, how='all', inplace=True)
return initial_df
You can potentially enclose the logic for a batch in a function.

check if the csv file exists and do the condition?

Hi I am working on csv file and I have a data I want to append these data to the csv file. But firstly I want to check if the csv file exists if TRUE then just open the csv file and append the data to csv file and save it, if NOT just create a DataFrame and with these data and save it.
Note: I have a csv file in my I want to append the sample of data to my csv file
thanks in advance.
here is my trying.
#sample of data
ID = 5
img_Latitude = 38786454
img_Longitude = 1118468
meta_lat = 45778
meta_long = 886556
#create a function
def create_csv( ID, img_Latitude, img_Longitude,meta_lat, meta_long):
#check if the file is exists, if True
if os.path.isfile('C:/My/Path/compare_coordinates.csv'):
#read the csv file
df = pd.read_csv('compare_coordinates.csv')
#make pd.series
data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long'])
#append the data to df
df.append(data, ignore_index=True)
else:
data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long']
df = pd.DataFrame(data, columns).T
df.to_csv('C:/My/Path/compare_coordinates.csv', index=False)
The line df.append(data, ignore_index = True) needs to be:
df = df.append(data, ignore_index = True)
This is because DatFrame.append returns a new DF with the appended lines, it does not append in-place:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
To get the values that needed must be saved in variable so for the line
df.append(data, ignore_index = True) to be edited to df = df.append(data, ignore_index = True) and for the getting value of file exists or not as following codes:
def create_csv( ID, img_Latitude, img_Longitude,meta_lat, meta_long):
Path = os.path.isfile('My/path/compare_coordinates1.csv')
if Path==True:
df = pd.read_csv('compare_coordinates1.csv')
data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long'])
df = df.append(data, ignore_index=True)
else:
data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat','meta_long']
df = pd.DataFrame(data, columns).T
df.to_csv('My/path/compare_coordinates1.csv', index=False)

How can I make my code recognize the function defined above

I am trying to run through a set of CSV files in order to compile a results CSV file. I'm getting an error that my function is undefined for some reason. Can you tell me why? Thanks.
def response_amp(data):
import pandas as pd
import numpy as np
#putting in and cutting out unnecessary parts of the data
df = pd.read_csv('data.csv', encoding = 'utf-8')
df = df[:-1]
a = df.columns[df.columns.str.startswith('ยต')]
df = df[a]
dfd = df.drop(df.index[:30]) #dropping the section with no sample
#splitting the data into chunks so response values can be measure
df1d = dfd[:320] #first interval
df2d = dfd[330:470] #second interval
df3d = dfd[480:] #third interval
#rolling avg on each
df1r = df1d.rolling(5, win_type='gaussian').sum(std=4)
df2r = df2d.rolling(5, win_type='gaussian').sum(std=4)
df3r = df3d.rolling(5, win_type='gaussian').sum(std=4)
bsln_1 = df1r.iloc[3:6].mean()
bsln_2 = df2r.iloc[3:6].mean()
bsln_3 = df3r.iloc[3:6].mean()
response_1 = abs(df1r.min()-bsln_1)/bsln_1
response_2 = abs(df1r.min()-bsln_2)/bsln_2
response_3 = abs(df1r.min()-bsln_3)/bsln_3
response = response_1,response_2,response_3
return(response)
import os
directory =(r'file directory goes here')
response = []
for filename in os.listdir(directory):
if filename.endswith(".csv"):
response.append(response_amp(filename))
a = numpy.asarray(response)
numpy.savetxt("ks_response.csv", a, delimiter=",")
Thanks for the help.

Separate Python web scraped data in different columns

I tried to scrape data by using API and put those result in an CSV file. But when I open my CSV file all the data is put together in 1 column(A). Instead I want the data to be separated in different columns(A & B (and C, D, E, F etc when I want to add info)). How can I do that?
import requests
import pandas as pd
from pandas.compat import StringIO
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
import csv
csv_file = open('/Users/katewang/Desktop/Test/scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
def get_EOD_data(api_token='5cb671b0b4a790.35526238', session = None, tickers = 'AAPL', start_date = dt.datetime(2018,1,1), end_date = dt.datetime(2018,12,31)):
symbols = tickers
if session is None:
session = requests.Session()
url = 'https://eodhistoricaldata.com/api/eod/%s.US' % symbols
params = {"api_token": api_token, "from": start_date, "to": end_date}
r = session.get(url, params = params)
if r.status_code == requests.codes.ok:
cols=[0,5]
df = pd.read_csv(StringIO(r.text), skipfooter = 1, parse_dates = [0], engine = 'python', na_values=['nan'], index_col = 0, usecols = cols)
df.fillna(method = 'ffill', inplace = True)
df.fillna(method = 'bfill', inplace = True)
return df
def main():
df_data = get_EOD_data()
csv_writer.writerow([df_data])
if __name__ == '__main__':
main()
csv_file.close()
I expect to see two separate columns.
You're seeing only one column since, out of the two selected columns 0 and 5, you set column 0 to be the index when creating the dataframe. This leaves only column 5 as an actual column.
You can check for yourself by removing index_col = 0 from the line
df = pd.read_csv(StringIO(r.text), skipfooter = 1, parse_dates = [0], engine = 'python', na_values=['nan'], index_col = 0, usecols = cols)

Categories

Resources