Hi a learner in python I have written code the extracts a son file from a sports website.
The code is
from bs4 import BeautifulSoup
import requests
import json
url = "https://s3-ap-southeast-2.amazonaws.com/racevic.static/2022-08-01/sportsbet-pakenham-synthetic/sectionaltimes/race-2.json?"
payload={}
headers = {}
response = requests.request("GET", url, headers=headers, data=payload)
print(response.text)
and the output looks like this (small portion)
sectionaltimes_callback({"Horses":[{"Comment":"Slow Out 1 Lengths , got back 2nd last off tardy start 8 Lengths 800m, still mile off them getting widest from the corner, charged home last 200m for eye catching second spot # powered home widest","FinalPosition":2,"FinalPositionAbbreviation":"2nd","FullName":"Ameerati","SaddleNumber":12,"HorseUrl":"/horses/ameerati","SilkUrl":"//cdn.silks.racing.com/bb/114031.png","Trainer":"Robbie Griffiths & Mathew de Kock","TrainerUrl":"/trainers/robbie-griffiths","Jockey":"P.M.Moloney","JockeyUrl":"/jockeys/patrick-moloney","SectionalTimes":[{"Distance":"1200m","Position":11,"Time":"1:11.43","AvgSpeed":0.0},{"Distance":"1000m","Position":11,"Time":"59.29","AvgSpeed":0.0},{"Distance":"800m","Position":11,"Time":"46.95","AvgSpeed":0.0},{"Distance":"600m","Position":11,"Time":"34.77","AvgSpeed":0.0},{"Distance":"400m","Position":11,"Time":"22.71","AvgSpeed":0.0},{"Distance":"200m","Position":4,"Time":"11.45","AvgSpeed":0.0},{"Distance":"Finish","Position":2,"Time":"","AvgSpeed":0.0}],"SplitTimes":[{"Distance":"1200m-1000m","Position":11,"Time":"12.14","AvgSpeed":0.0},{"Distance":"1000m-800m","Position":11,"Time":"12.34","AvgSpeed":0.0},{"Distance":"800m-600m","Position":11,"Time":"12.18","AvgSpeed":0.0},{"Distance":"600m-400m","Position":11,"Time":"12.06","AvgSpeed":0.0},{"Distance":"400m-200m","Position":11,"Time":"11.26","AvgSpeed":0.0},{"Distance":"200m-Finish","Position":4,"Time":"11.45","AvgSpeed":0.0}],"StartPosition":0,"BarrierNumber":12,"RaceTime":"","TimeVarToWinner":0.0,"BeatenMargin":0.0,"DistanceRun":0,"DistanceVarToWinner":"","SixHundredMetresTime":"34.77","TwoHundredMetresTime":"11.45","Early":0.0,"Mid":0.0,"Late":0.0,"OverallPeakSpeed":0.0,"PeakSpeedLocation":null,"OverallAvgSpeed":0.0,"DistanceFromRail":0.0},
The help I would appreciate now is what do I do to put this in a format that I can open in excel
import pandas as pd
import requests
import json
from openpyxl import Workbook
from openpyxl.utils import get_column_letter
from openpyxl.utils.dataframe import dataframe_to_rows
def race_data_to_xslxs(url, fname):
# get data
data = json.loads(requests.get(url).text[24:-1])
# create dataframes
dfs = {}
singulars = pd.DataFrame()
for k, v in data.items():
if isinstance(v, list):
dfs[k] = pd.DataFrame(v)
else:
singulars[k] = [v]
dfs = {'summary': singulars, **dfs}
# create workbook
wb = Workbook()
for k, df in dfs.items():
# create sheet
wsx = wb.create_sheet(title=k)
rows = dataframe_to_rows(df)
for r_idx, row in enumerate(rows, 1):
for c_idx, value in enumerate(row, 1):
wsx.cell(row=r_idx, column=c_idx, value=str(value))
del wb['Sheet']
# write excel file
wb.save(filename=fname)
url = "https://s3-ap-southeast-2.amazonaws.com/racevic.static/2022-08-01/sportsbet-pakenham-synthetic/sectionaltimes/race-2.json?"
path = 'fname.xlsx'
race_data_to_xslxs(url=url, fname=path)
The API is returning JSONP, not JSON. This is JSON wrapped in a call to a callback function, which can be used by browsers without violating the same-origin rule. You need to remove that function call before parsing it as JSON.
import re
import json
response = requests.request("GET", url, headers=headers, data=payload)
json_string = re.sub(r'^sectionaltimes_callback\((.*)\)$', r'\1', response)
data = json.loads(json_string)
You can try this -
import requests
import json
url = "https://s3-ap-southeast-2.amazonaws.com/racevic.static/2022-08-01/sportsbet-pakenham-synthetic/sectionaltimes/race-2.json?"
response = requests.get(url)
json.loads(response.text.split("(", 1)[1].strip(")"))
I'm playing around with some code to read JSON encoded data from a URL, push it into a data frame and save the results to a CSV. The code that I attempted to run is shown below. I think this is pretty close, but something is wrong, because nothing gets downloaded.
import urllib
from urllib.request import urlopen
import json
import pandas as pd
from pandas.io.json import json_normalize
all_links = ['https://www.baptisthealthsystem.com/docs/global/standard-charges/474131755_abrazomaranahospital_standardcharges.json?sfvrsn=9a27928_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621861138_abrazocavecreekhospital_standardcharges.json?sfvrsn=674fd6f_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621809851_abrazomesahospital_standardcharges.json?sfvrsn=13953222_2',
'https://www.baptisthealthsystem.com/docs/global/standard-charges/621811285_abrazosurprisehospital_standardcharges.json?sfvrsn=c8113dcf_2']
for item in all_links:
#print(item)
try:
length = len(item)
first_under = item.find('_') + 1
last_under = item.rfind('?') - 21
file_name = item[first_under:last_under]
print(file_name)
# store the response of URL
response = urlopen(item)
data = json.loads(response.read())
#print(type(data))
data = json.loads(item.read().decode())
df = pd.DataFrame(json_normalize(data, 'metrics'), encoding='mac_roman')
DOWNLOAD_PATH = 'C:\\Users\\ryans\\Desktop\\hospital_data\\' + file_name + '.csv'
urllib.request.urlretrieve(df,DOWNLOAD_PATH)
except Exception as e: print(e)
Any thoughts on what could be wrong here?
I am trying to modify the following code (I am newbie at python, so try to teach me step by step)
import requests, json
import pandas as pd
class AjaxScraper():
results = []
def fetch(self, url):
return requests.get(url)
def parse(self, content):
self.results = content['data']
for entry in self.results:
del entry['_id']
def to_csv(self):
df = pd.DataFrame(self.results)
pd.to_csv('Table.csv', sep=',', encoding='utf-8',index = False)
def start_me(self):
response = self.fetch('https://scrapingkungfu.herokuapp.com/api?_=1576384789999')
self.parse(response.json())
self.to_csv()
if __name__ == '__main__':
scraper = AjaxScraper()
scraper.start_me()
I have got errors like that
File "demo.py", line 24, in start_me
self.to_csv()
File "demo.py", line 19, in to_csv
pd.to_csv('Table.csv', sep=',', encoding='utf-8',index = False)
AttributeError: module 'pandas' has no attribute 'to_csv'
I wonder why this error appears although I saw many codes that has to_csv in pandas package..!!
** This is a simple dataframe that I need to learn how to reorder the columns using the index of columns
import pandas as pd
name_dict = {
'Name': ['a','b','c','d'],
'Score': [90,80,95,20]
}
df = pd.DataFrame(name_dict)
print (df)
to_csv is a method of a DataFrame object, not of the pandas module.
You need to create a dataframe
Reordering the Dataframe with your example
import pandas as pd
name_dict = {
'Name': ['a','b','c','d'],
'Score': [90,80,95,20]
}
df = pd.DataFrame(name_dict)
print (df)
The solution is creating a new data frame with our desired order
df = df[['Score', 'Name']]
i want to scrape web data using input values from excel and scraping web for each row_value taken and save the output to same excel file.
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
ciks = pd.read_csv("ciks.csv")
ciks.head()
output
CIK
0 1557822
1 1598429
2 1544670
3 1574448
4 1592290
then
for x in ciks:
url="https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + x +"&owner=exclude&action=getcompany"
r = request.urlopen(url)
bytecode = r.read()
htmlstr = bytecode.decode()
soup = BeautifulSoup(bytecode)
t = soup.find('span',{'class':'companyName'})
print(t.text)
i got an erorr :
----> 9 print (t.text)
AttributeError: 'NoneType' object has no attribute 'text'
here, i want to scrape web data taking each row value as input from the CSV file.
It would be easier to convert the column values as list and then use it in the for loop - see solution below,
from bs4 import BeautifulSoup
import requests
from urllib import request
import os
import pandas as pd
#ciks = pd.read_csv("ciks.csv")
df = pd.read_csv("ciks.csv")
mylist = df['CIK'].tolist()# CIK is the column name
company =[]
for item in mylist:
print(item)
url="https://www.sec.gov/cgi-bin/browse-edgar?CIK=" + str(item) +"&owner=exclude&action=getcompany"
r = request.urlopen(url)
bytecode = r.read()
htmlstr = bytecode.decode()
soup = BeautifulSoup(bytecode,features="lxml")
t = soup.find('span',{'class':'companyName'})
company.append(t.text)
print(t.text)
df.assign(company= company)
print(df)
df.to_csv("ciks.csv")
I'm very new at this and I'm having trouble writing a CSV file from a webscraping. Can anyone help? Thank you!
import sys
import urllib2
import csv
import requests
from bs4 import BeautifulSoup
#web scraping wind data
r_wind = requests.get('http://w1.weather.gov/data/obhistory/KCQX.html')
html_wind = r_wind.text
soup = BeautifulSoup(html_wind, "html.parser")
table = soup.find('table')
rows_wind = table.findAll('tr')
rows_wind = rows_wind[1:]
#writing to a csv file
csvfile_wind = open("wind.csv","wb")
output_wind = csv.writer(csvfile_wind, delimiter=',',quotechar='"',quoting=csv.QUOTE_MINIMAL)
for row in rows_wind:
Date = cells[0].text.strip()
Time = cells[1].text.strip()
Wind = cells[2].text.strip()
output.writerow([Date,Time,Wind])
csvfile_wind.close()