I am trying to modify the following code (I am a newbie at Python, so please teach me step by step):
import requests, json
import pandas as pd

class AjaxScraper():
    results = []

    def fetch(self, url):
        return requests.get(url)

    def parse(self, content):
        self.results = content['data']
        for entry in self.results:
            del entry['_id']

    def to_csv(self):
        df = pd.DataFrame(self.results)
        pd.to_csv('Table.csv', sep=',', encoding='utf-8', index=False)

    def start_me(self):
        response = self.fetch('https://scrapingkungfu.herokuapp.com/api?_=1576384789999')
        self.parse(response.json())
        self.to_csv()

if __name__ == '__main__':
    scraper = AjaxScraper()
    scraper.start_me()
I got an error like this:
File "demo.py", line 24, in start_me
self.to_csv()
File "demo.py", line 19, in to_csv
pd.to_csv('Table.csv', sep=',', encoding='utf-8',index = False)
AttributeError: module 'pandas' has no attribute 'to_csv'
I wonder why this error appears, although I have seen many examples that use to_csv with pandas.
This is a simple dataframe for which I also need to learn how to reorder the columns using the column index:
import pandas as pd

name_dict = {
    'Name': ['a','b','c','d'],
    'Score': [90,80,95,20]
}

df = pd.DataFrame(name_dict)
print(df)
to_csv is a method of a DataFrame object, not of the pandas module, so you need to create a DataFrame and call to_csv on it.
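In your scraper, that means calling to_csv on the DataFrame you just built rather than on pd. A minimal sketch of the corrected method (file name and options kept from your code):

    def to_csv(self):
        # Build the DataFrame from the scraped results
        df = pd.DataFrame(self.results)
        # Call to_csv on the DataFrame itself, not on the pandas module
        df.to_csv('Table.csv', sep=',', encoding='utf-8', index=False)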
Reordering the DataFrame with your example:
import pandas as pd

name_dict = {
    'Name': ['a','b','c','d'],
    'Score': [90,80,95,20]
}

df = pd.DataFrame(name_dict)
print(df)
The solution is to create a new DataFrame with the columns in the desired order:
df = df[['Score', 'Name']]
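Since you asked about reordering by column index rather than by name, here is a minimal sketch doing the same thing with integer positions (assuming position 1 is 'Score' and position 0 is 'Name', as in your example):

    # iloc selects by integer position: all rows, then columns 1 and 0 in that order
    df = df.iloc[:, [1, 0]]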
I have the following code that reads some JSON files from a directory and returns them after some preprocessing. However, some of them are dicts, so they do not have the desired columns. As a result, I get back this error:
KeyError: "None of [Index(['aaa', 'xxx'], dtype='object')] are in the [columns]"
How can I ignore them and continue with the other JSON files? Perhaps with a try-except block?
import os, json
import pandas as pd

path_to_json = 'C:/Users/aaa/Desktop/'
json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]

def func(s):
    try:
        return eval(s)
    except:
        return dict()

list_of_df = []

for i in range(len(json_files)):
    file_name = json_files[i]
    df = pd.read_json(file_name, lines=True)
    df = df[['columnx']]
    df = df['columnx'].apply(func)
    df = pd.json_normalize(df)
    df = pd.DataFrame(df[["xxx", "aaa"]])
    list_of_df.append(df)

df = pd.concat(list_of_df)
df = df[['Index','xxx', 'aaa']]
df.head()
You have to add a try-except block inside the for loop that iterates over the JSON files.
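A minimal sketch of that loop with the failing selections wrapped in try-except (column names kept from your code; files that raise the KeyError are skipped):

    list_of_df = []

    for file_name in json_files:
        df = pd.read_json(file_name, lines=True)
        try:
            df = df[['columnx']]
            df = df['columnx'].apply(func)
            df = pd.json_normalize(df)
            df = pd.DataFrame(df[["xxx", "aaa"]])
        except KeyError:
            # This file is missing 'columnx' or the 'xxx'/'aaa' keys, so skip it
            continue
        list_of_df.append(df)

    df = pd.concat(list_of_df)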
I am trying to access data by web scraping and turn it into a DataFrame using pandas. With the following code, I am already able to get the DataFrame. I want to combine all the DataFrames with append into one large DataFrame.
import requests
import re
import pandas as pd
from urllib.parse import unquote
from json import loads
from bs4 import BeautifulSoup

# Download URL
url = "https://riwayat-file-covid-19-dki-jakarta-jakartagis.hub.arcgis.com/"
req = requests.get(url)

# Get encoded JSON from HTML source
encoded_data = re.search(r"window\.__SITE=\"(.*)\"", req.text).groups()[0]

# Decode and load as dictionary
json_data = loads(unquote(encoded_data))

# Get the HTML source code for the links
html_src = json_data["site"]["data"]["values"]["layout"]["sections"][1]["rows"][0]["cards"][0]["component"]["settings"]["markdown"]

# Parse it using BeautifulSoup
soup = BeautifulSoup(html_src, 'html.parser')

# Get links
links = soup.find_all('a')

# For each link...
link_list = []
id_list = []
date_list = []
dataframe_csv = []

for link in links:
    if "2021" in link.text:
        link_list.append(link.text + " - " + link.attrs['href'])

link_list.remove("31 Januari 2021 Pukul 10.00 - https://drive.google.com/file/d/1vd1tToQbx3A420KMDA63aKviLjgGPJMd/view?usp=sharing")

for i in link_list:
    id_list.append(i.split("/")[5])
    date_list.append(i.split("/")[0][:-21])

for ID in id_list:
    dataframe_csv.append("https://docs.google.com/spreadsheets/d/" + ID + "/export?format=csv")
I want to combine all the DataFrames that I have by using a loop. On every iteration, I want to remove the row at index 0 and add a new Date column. The code is as follows:
date_num = 0
df_total = pd.DataFrame()

for i in dataframe_csv:
    df = pd.read_csv(i)
    df = df.drop(index=df.index[0], axis=0, inplace=True)
    df = df.assign(Date = date_list[date_num])
    date_num += 1
    df_total.append(df, ignore_index=True)
The problem is, I get an error like this:
AttributeError Traceback (most recent call last)
<ipython-input-11-ef67f0a87a8e> in <module>
5 df = pd.read_csv(i)
6 df = df.drop(index=df.index[0], axis=0, inplace=True)
----> 7 df = df.assign(Date = date_list[date_num])
8
9 date_num += 1
AttributeError: 'NoneType' object has no attribute 'assign'
inplace=True modifies the DataFrame directly and returns None, so either remove it:
date_num = 0
df_total = pd.DataFrame()

for i in dataframe_csv:
    df = pd.read_csv(i)
    df = df.drop(index=df.index[0], axis=0)
    df = df.assign(Date = date_list[date_num])
    date_num += 1
    # append also returns a new DataFrame, so assign the result back
    df_total = df_total.append(df, ignore_index=True)
Or don't assign the result back:
date_num = 0
df_total = pd.DataFrame()

for i in dataframe_csv:
    df = pd.read_csv(i)
    df.drop(index=df.index[0], axis=0, inplace=True)
    df = df.assign(Date = date_list[date_num])
    date_num += 1
    df_total = df_total.append(df, ignore_index=True)
As mentioned in the documentation of drop:
inplace : bool, default False
If False, return a copy. Otherwise, do operation inplace and return None.
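Note that DataFrame.append behaves the same way: it returns a new DataFrame rather than extending df_total in place, which is why its result is assigned back in the loops above. As a minimal alternative sketch, you can also collect the frames in a list and concatenate once at the end, which avoids repeatedly copying df_total:

    frames = []

    for date, url in zip(date_list, dataframe_csv):
        df = pd.read_csv(url)
        # Drop the first row, then tag every remaining row with its date
        df = df.drop(index=df.index[0], axis=0)
        df = df.assign(Date = date)
        frames.append(df)

    df_total = pd.concat(frames, ignore_index=True)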
I'm trying to write read and save functions for a CSV, but I can't get them to produce any output. I'm reading from a CSV, replacing the ";" in the second column with " ", and then saving the processed CSV. But for some reason it doesn't save my CSV. Is my function wrong?
I'm starting out in the Python world, and I'm having a bit of trouble.
import pandas as pd

header_col = ['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';', names=header_col, header=None)

def file_load(df):
    df['col1'] = df['col1'].str.replace(';', ' ')
    df.drop(columns=['col8'], inplace=True)
    df.drop(columns=['col9'], inplace=True)
    return df

def save_file(dataframe):
    df = dataframe
    df.to_csv('myfile_<date>_treat.csv', sep=';', encoding='utf-8', index=False)
Your functions are defined but never called, so the output file is never written. Wrap the calls in a main():

import pandas as pd

def file_load(df):
    df['col1'] = df['col1'].str.replace(';', ' ')
    df.drop(columns=['col8'], inplace=True)
    df.drop(columns=['col9'], inplace=True)
    return df

def save_file(dataframe):
    df = dataframe
    df.to_csv('myfile_<date>_treat.csv', sep=',', encoding='utf-8', index=False)

def main():
    header_col = ['col0','col1','col2','col3','col4','col5','col6','col7','col8','col9']
    df = pd.read_csv('myfile_<date>.csv', encoding="ISO-8859-1", sep=';', names=header_col, header=None)
    df1 = file_load(df)
    save_file(df1)

if __name__ == '__main__':
    main()
I tried to scrape data using an API and put the results in a CSV file. But when I open my CSV file, all the data is lumped together in one column (A). Instead, I want the data separated into different columns (A and B, and C, D, E, F, etc. when I want to add more info). How can I do that?
import requests
import pandas as pd
from pandas.compat import StringIO
import numpy as np
import datetime as dt
from dateutil.relativedelta import relativedelta
import csv

csv_file = open('/Users/katewang/Desktop/Test/scrape.csv', 'w')
csv_writer = csv.writer(csv_file)

def get_EOD_data(api_token='5cb671b0b4a790.35526238', session=None, tickers='AAPL', start_date=dt.datetime(2018,1,1), end_date=dt.datetime(2018,12,31)):
    symbols = tickers
    if session is None:
        session = requests.Session()
    url = 'https://eodhistoricaldata.com/api/eod/%s.US' % symbols
    params = {"api_token": api_token, "from": start_date, "to": end_date}
    r = session.get(url, params=params)
    if r.status_code == requests.codes.ok:
        cols = [0, 5]
        df = pd.read_csv(StringIO(r.text), skipfooter=1, parse_dates=[0], engine='python', na_values=['nan'], index_col=0, usecols=cols)
        df.fillna(method='ffill', inplace=True)
        df.fillna(method='bfill', inplace=True)
        return df

def main():
    df_data = get_EOD_data()
    csv_writer.writerow([df_data])

if __name__ == '__main__':
    main()

csv_file.close()
I expect to see two separate columns.
You're seeing only one column since, out of the two selected columns 0 and 5, you set column 0 to be the index when creating the dataframe. This leaves only column 5 as an actual column.
You can check for yourself by removing index_col = 0 from the line
df = pd.read_csv(StringIO(r.text), skipfooter = 1, parse_dates = [0], engine = 'python', na_values=['nan'], index_col = 0, usecols = cols)
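Separately, csv_writer.writerow([df_data]) writes the whole DataFrame's string representation into a single CSV cell. A minimal sketch that lets pandas write the file instead (path kept from your code, assuming the request succeeds and a DataFrame is returned):

    def main():
        df_data = get_EOD_data()
        # to_csv writes the date index and the selected column as separate CSV columns
        df_data.to_csv('/Users/katewang/Desktop/Test/scrape.csv')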
I am trying to normalize a nested JSON response from this URL. As it is quite nested, I am not able to achieve the desired format. Can anyone point me in the right direction?
I am using this approach to normalize:
from urllib.request import urlopen
import json
from pandas.io.json import json_normalize
import pandas as pd

class jsonResp():
    def __init__(self):
        global data
        global data1
        global path
        pd.set_option('display.max_rows', 500)
        pd.set_option('display.max_columns', 500)
        pd.set_option('display.width', 1000)
        requestURL = ("http://data.corkcity.ie/api/3/action/datastore_search?id=6cc1028e-7388-4bc5-95b7-667a59aa76dc")  # Request URL for JSON
        responseOpen = urlopen(requestURL)
        elevations = responseOpen.read()  # Reads the response
        data = json.loads(elevations)  # Loads the JSON for normalization and parsing
        df = pd.DataFrame.from_dict(json_normalize(data), orient='columns')
        print(df)

if __name__ == '__main__':
    obj = jsonResp()
Thanks
Start by navigating to the records and then use json_normalize():
import requests
from pandas.io.json import json_normalize

json_data = requests.get("http://data.corkcity.ie/api/3/action/datastore_search?id=6cc1028e-7388-4bc5-95b7-667a59aa76dc").json()

df = json_normalize(json_data["result"]["records"])
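To inspect the result, a quick check (assuming the request succeeded):

    # Each record becomes a row, with nested keys flattened into columns
    print(df.head())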