Separate Python web scraped data in different columns - python

I tried to scrape data using an API and put the results in a CSV file. But when I open my CSV file, all the data is put together in one column (A). Instead, I want the data separated into different columns (A and B, and C, D, E, F etc. when I add more info). How can I do that?
import requests
import pandas as pd
from io import StringIO  # pandas.compat.StringIO was removed in newer pandas versions
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
import csv

csv_file = open('/Users/katewang/Desktop/Test/scrape.csv', 'w')
csv_writer = csv.writer(csv_file)

def get_EOD_data(api_token='5cb671b0b4a790.35526238', session=None, tickers='AAPL',
                 start_date=dt.datetime(2018, 1, 1), end_date=dt.datetime(2018, 12, 31)):
    symbols = tickers
    if session is None:
        session = requests.Session()
    url = 'https://eodhistoricaldata.com/api/eod/%s.US' % symbols
    params = {"api_token": api_token, "from": start_date, "to": end_date}
    r = session.get(url, params=params)
    if r.status_code == requests.codes.ok:
        cols = [0, 5]
        df = pd.read_csv(StringIO(r.text), skipfooter=1, parse_dates=[0], engine='python',
                         na_values=['nan'], index_col=0, usecols=cols)
        df.fillna(method='ffill', inplace=True)
        df.fillna(method='bfill', inplace=True)
        return df

def main():
    df_data = get_EOD_data()
    csv_writer.writerow([df_data])

if __name__ == '__main__':
    main()
    csv_file.close()
I expect to see two separate columns.

You're seeing only one column because, of the two selected columns (0 and 5), you set column 0 as the index when creating the DataFrame, which leaves only column 5 as an actual column.
You can check this for yourself by removing index_col=0 from the line

df = pd.read_csv(StringIO(r.text), skipfooter=1, parse_dates=[0], engine='python', na_values=['nan'], index_col=0, usecols=cols)
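There is a second issue: csv_writer.writerow([df_data]) stringifies the entire DataFrame into a single CSV cell, which also collapses everything into column A. A minimal sketch of main that lets pandas write the file instead, so the date index and the value column land in columns A and B:

def main():
    df_data = get_EOD_data()
    # DataFrame.to_csv writes the index and each column into separate CSV columns.
    df_data.to_csv('/Users/katewang/Desktop/Test/scrape.csv')

With this, the csv module and the manually opened file handle are no longer needed.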

Related

How to use multiprocess to update pandas.dataframe in python?

The goal of the following code is to group big_df by userid and sort each user's rows by timestamp. Finally, I get selected_df.
import pandas as pd
from collections import Counter

def init_process():
    big_df = ...  # a DataFrame with four columns and many rows
    big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
    users_list = Counter(big_df['userid'])
    selected_df = pd.DataFrame()  # it is an empty dataframe
    for uid in users_list:
        small_df = big_df[big_df['userid'] == uid]
        new_user_items_df = small_df.sort_values(by=['timestamp'])
        selected_df = selected_df.append(new_user_items_df, ignore_index=True)

if __name__ == '__main__':
    init_process()
But when big_df is very large, sometimes 80 GB, it takes many days to process. Therefore, I want to use multiprocessing to parallelize it. Then I have the following code:
import pandas as pd
from collections import Counter
from multiprocessing import Pool

selected_df = pd.DataFrame()
big_df = pd.DataFrame()

def process_pd(uid):
    global selected_df
    small_df = big_df[big_df['userid'] == uid]
    new_user_items_df = small_df.sort_values(by=['timestamp'])
    selected_df = selected_df.append(new_user_items_df, ignore_index=True)

def init_process():
    global big_df
    big_df = ...  # a DataFrame with four columns and many rows
    big_df.columns = ['userid', 'itemid', 'ratings', 'timestamp']
    users_list = Counter(big_df['userid'])
    selected_df = pd.DataFrame()  # it is an empty dataframe
    num_cpus = 5
    process_pool = Pool(processes=num_cpus)
    dfs = process_pool.map(process_pd, users_list)
    print(selected_df)

if __name__ == '__main__':
    init_process()
The selected_df comes out empty. I think it may be because the multiple processes do not update the global dataframe selected_df. But I am new to pandas; could anyone tell me how to modify this code?
Thank you very much.
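No answer is attached here, but the question's own diagnosis is correct: each worker runs in a separate process with its own copy of the globals, so appending to selected_df in a child never reaches the parent. The usual fix is to have each worker return its slice and combine the results of Pool.map in the parent. A minimal sketch along those lines, assuming a fork-based start method so the workers inherit big_df:

import pandas as pd
from multiprocessing import Pool

big_df = pd.DataFrame()  # assumed to be loaded with the real data before mapping

def process_pd(uid):
    # Return the sorted per-user slice instead of mutating a global.
    return big_df[big_df['userid'] == uid].sort_values(by=['timestamp'])

def init_process():
    users_list = big_df['userid'].unique()
    with Pool(processes=5) as process_pool:
        dfs = process_pool.map(process_pd, users_list)
    # Combine the per-user frames returned by the workers, in the parent.
    selected_df = pd.concat(dfs, ignore_index=True)
    print(selected_df)

For this particular task, a single big_df.sort_values(['userid', 'timestamp']) would also produce the same grouping and ordering without multiprocessing at all.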

Export JSON results to CSV using Pandas package

I am trying to modify the following code (I am a newbie at Python, so try to teach me step by step):
import requests, json
import pandas as pd

class AjaxScraper():
    results = []

    def fetch(self, url):
        return requests.get(url)

    def parse(self, content):
        self.results = content['data']
        for entry in self.results:
            del entry['_id']

    def to_csv(self):
        df = pd.DataFrame(self.results)
        pd.to_csv('Table.csv', sep=',', encoding='utf-8', index=False)

    def start_me(self):
        response = self.fetch('https://scrapingkungfu.herokuapp.com/api?_=1576384789999')
        self.parse(response.json())
        self.to_csv()

if __name__ == '__main__':
    scraper = AjaxScraper()
    scraper.start_me()
I get an error like this:
File "demo.py", line 24, in start_me
self.to_csv()
File "demo.py", line 19, in to_csv
pd.to_csv('Table.csv', sep=',', encoding='utf-8',index = False)
AttributeError: module 'pandas' has no attribute 'to_csv'
I wonder why this error appears, although I have seen many examples that use to_csv with the pandas package.
Also, this is a simple dataframe that I need to learn how to reorder the columns of, using the column index:
import pandas as pd

name_dict = {
    'Name': ['a', 'b', 'c', 'd'],
    'Score': [90, 80, 95, 20]
}
df = pd.DataFrame(name_dict)
print(df)
to_csv is a method of a DataFrame object, not of the pandas module, so it has to be called on the dataframe you create.
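Applied to the class above, the corrected method would read:

    def to_csv(self):
        df = pd.DataFrame(self.results)
        # Call to_csv on the DataFrame instance, not on the pandas module.
        df.to_csv('Table.csv', sep=',', encoding='utf-8', index=False)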
Reordering the Dataframe with your example
import pandas as pd

name_dict = {
    'Name': ['a', 'b', 'c', 'd'],
    'Score': [90, 80, 95, 20]
}
df = pd.DataFrame(name_dict)
print(df)
The solution is to create a new dataframe with the desired column order:
df = df[['Score', 'Name']]
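Since the question asks about reordering by column index specifically, iloc does the same thing positionally; a small sketch:

df = df.iloc[:, [1, 0]]  # column 1 ('Score') first, then column 0 ('Name')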

Read multiple files in Python and generate one output

I have a python script for generating 1 upload file from 1 input file.
The thing is that the input files have started coming in batches, 30-50 at one time.
e.g.:
1111.xlsx --> upload.xlsx
1125.xlsx --> upload.xlsx
1176.xlsx --> upload.xlsx
1322.xlsx --> upload.xlsx
The code just converts the input file into the upload format.
Here's what I have done so far (1 input file -> 1 output file):
def main():
initial_workbook = 'C:/files/1111.xlsx'
temp_df = pd.ExcelFile(initial_workbook)
initial_df = pd.read_excel(initial_workbook, sheet_name = "default")
#drop first 4 rows to set header
new_header = initial_df.iloc[2]
initial_df = initial_df.iloc[3:]
initial_df.columns = new_header
#drop all rows with no data
indexNames = initial_df[initial_df['grade'] == 'select'].index
initial_df.drop(indexNames , inplace=True)
initial_df.dropna(axis=1, how='all')
output = initial_df.to_excel('C:/files/upload_file.xlsx', index = False)
Is there a way to generate one upload file for all the files in the input folder? And once the input files have been processed, rename them by prefixing an x, e.g. x1111.xlsx.
So here is how I would approach it, for a given batch:
from datetime import datetime
import os
from pathlib import Path
import pandas as pd

proj_path = Path("C:/files/")

def main(f):
    initial_workbook = proj_path / f
    temp_df = pd.ExcelFile(initial_workbook)
    initial_df = pd.read_excel(initial_workbook, sheet_name="default")
    # drop first 4 rows to set header
    new_header = initial_df.iloc[2]
    initial_df = initial_df.iloc[3:]
    initial_df.columns = new_header
    # drop all rows with no data
    indexNames = initial_df[initial_df['grade'] == 'select'].index
    initial_df.drop(indexNames, inplace=True)
    initial_df.dropna(axis=1, how='all', inplace=True)
    return initial_df

all_dfs = []
for f in os.listdir(proj_path):
    if f.endswith(".xlsx"):
        print(f"processing {f}...")
        df_tmp = main(f)  # main joins proj_path itself
        df_tmp["file_name"] = f
        all_dfs.append(df_tmp)

df_all = pd.concat(all_dfs, axis=0)
# strftime keeps characters like ':' out of the file name, which Windows rejects
df_all.to_excel(proj_path / f"{datetime.now():%Y%m%d_%H%M%S}_batch.xlsx", index=False)
You can potentially enclose the logic for a batch in a function.
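The renaming part of the question is not covered above; a minimal sketch for it, assuming the x prefix is how already-processed files are marked and skipped on the next run:

for f in os.listdir(proj_path):
    if f.endswith(".xlsx") and not f.startswith("x"):
        # ... process the file as above ...
        os.rename(proj_path / f, proj_path / f"x{f}")  # mark as processed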

check if the csv file exists and do the condition?

Hi, I am working with a CSV file and I have data I want to append to it. But first I want to check whether the CSV file exists: if TRUE, just open the CSV file, append the data to it, and save it; if NOT, create a DataFrame with the data and save it.
Note: I already have a CSV file in my path, and I want to append the sample data below to it.
Thanks in advance.
Here is my attempt:
import os
import pandas as pd

# sample of data
ID = 5
img_Latitude = 38786454
img_Longitude = 1118468
meta_lat = 45778
meta_long = 886556

# create a function
def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    # check if the file exists, if True
    if os.path.isfile('C:/My/Path/compare_coordinates.csv'):
        # read the csv file
        df = pd.read_csv('compare_coordinates.csv')
        # make pd.Series
        data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
                         index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
        # append the data to df
        df.append(data, ignore_index=True)
    else:
        data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
        columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
        df = pd.DataFrame(data, columns).T
    df.to_csv('C:/My/Path/compare_coordinates.csv', index=False)
The line df.append(data, ignore_index=True) needs to be:
df = df.append(data, ignore_index=True)
This is because DataFrame.append returns a new DataFrame with the appended rows; it does not append in place:
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.append.html
The value that append returns must be saved back into a variable, so the line df.append(data, ignore_index=True) should be edited to df = df.append(data, ignore_index=True). With that change, and the file-existence check, the code looks as follows:
def create_csv(ID, img_Latitude, img_Longitude, meta_lat, meta_long):
    file_exists = os.path.isfile('My/path/compare_coordinates1.csv')
    if file_exists:
        df = pd.read_csv('compare_coordinates1.csv')
        data = pd.Series([ID, img_Latitude, img_Longitude, meta_lat, meta_long],
                         index=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
        df = df.append(data, ignore_index=True)
    else:
        data = [ID, img_Latitude, img_Longitude, meta_lat, meta_long]
        columns = ['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long']
        df = pd.DataFrame(data, columns).T
    df.to_csv('My/path/compare_coordinates1.csv', index=False)
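Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, so on current pandas the same append step can be sketched with pd.concat:

# Equivalent of df = df.append(data, ignore_index=True) on modern pandas:
new_row = pd.DataFrame([[ID, img_Latitude, img_Longitude, meta_lat, meta_long]],
                       columns=['ID', 'img_Latitude', 'img_Longitude', 'meta_lat', 'meta_long'])
df = pd.concat([df, new_row], ignore_index=True)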

How can I make my code recognize the function defined above

I am trying to run through a set of CSV files in order to compile a results CSV file. I'm getting an error that my function is undefined for some reason. Can you tell me why? Thanks.
def response_amp(data):
    import pandas as pd
    import numpy as np
    # putting in and cutting out unnecessary parts of the data
    df = pd.read_csv('data.csv', encoding='utf-8')
    df = df[:-1]
    a = df.columns[df.columns.str.startswith('µ')]
    df = df[a]
    dfd = df.drop(df.index[:30])  # dropping the section with no sample
    # splitting the data into chunks so response values can be measured
    df1d = dfd[:320]     # first interval
    df2d = dfd[330:470]  # second interval
    df3d = dfd[480:]     # third interval
    # rolling avg on each
    df1r = df1d.rolling(5, win_type='gaussian').sum(std=4)
    df2r = df2d.rolling(5, win_type='gaussian').sum(std=4)
    df3r = df3d.rolling(5, win_type='gaussian').sum(std=4)
    bsln_1 = df1r.iloc[3:6].mean()
    bsln_2 = df2r.iloc[3:6].mean()
    bsln_3 = df3r.iloc[3:6].mean()
    response_1 = abs(df1r.min() - bsln_1) / bsln_1
    response_2 = abs(df1r.min() - bsln_2) / bsln_2
    response_3 = abs(df1r.min() - bsln_3) / bsln_3
    response = response_1, response_2, response_3
    return response
import os

directory = r'file directory goes here'
response = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        response.append(response_amp(filename))

a = numpy.asarray(response)
numpy.savetxt("ks_response.csv", a, delimiter=",")
Thanks for the help.
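No answer is attached to this one, but two things stand out. A NameError saying response_amp is undefined usually means the def was not executed in the same file or session as the loop that calls it, so the first fix is to run everything as one script with the function defined before use. Beyond that, numpy is referenced by its full name but only imported as np inside the function, and read_csv loads the hard-coded 'data.csv' instead of the data argument, so every file would produce identical results. A minimal restructuring sketch under those assumptions:

import os
import numpy as np
import pandas as pd

def response_amp(path):
    # Read the file that was actually passed in, not a hard-coded name.
    df = pd.read_csv(path, encoding='utf-8')
    # ... the interval / rolling-average analysis from the question goes here ...
    return df.mean().mean()  # placeholder so the sketch runs end to end

directory = r'file directory goes here'
results = []
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        # Join the directory so files outside the current working directory are found.
        results.append(response_amp(os.path.join(directory, filename)))

np.savetxt("ks_response.csv", np.asarray(results), delimiter=",")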
