I keep getting the below keyerror and can't figure out what it means or what I should be doing different.
KeyError: "None of [Index(['team totals', 'mp_max', 'fg_max', 'fga_max', 'fg%_max', '3p_max',\n '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max',\n 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max',\n 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max',\n 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max',\n 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max'],\n dtype='object')] are in the [columns]"
my code is
from bs4 import BeautifulSoup
import pandas
import os
SEASONS = list(range(2016, 2017))
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")
box_scores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, f) for f in box_scores if f.endswith(".html")]
def parse_html(box_score):
with open(box_score) as f:
html = f.read()
soup = BeautifulSoup(html)
[s.decompose() for s in soup.select("tr.over_header")] # this removes the tr tag with class over_header from the html
[s.decompose() for s in soup.select("tr.thead")]
return soup
def read_line_score(soup):
line_score = pandas.read_html(str(soup), attrs = {"id": "line_score"})[0]
cols = list(line_score.columns)
cols[0] = "team"
cols[-1] = "total"
line_score.columns = cols
line_score = line_score[["team", "total"]]
return line_score
def read_stats(soup, team, stat):
df = pandas.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
df = df.apply(pandas.to_numeric, errors="coerce")
return df
def read_season_info(soup):
nav = soup.select("#bottom_nav_container")[0]
hrefs = [a["href"] for a in nav.find_all("a")]
season = os.path.basename(hrefs[1]).split("_")[0]
return season
base_cols = None
games = []
for box_score in box_scores:
soup = parse_html(box_score)
line_score = read_line_score(soup)
teams = list(line_score["team"]) #grabs just the teams who played each other
summaries = []
for team in teams:
basic = read_stats(soup, team, "basic")
advanced = read_stats(soup, team, "advanced")
totals = pandas.concat([basic.iloc[-1:], advanced.iloc[-1:]])
totals.index = totals.index.str.lower() # to lower case
maxes = pandas.concat([basic.iloc[:-1,:].max(), advanced.iloc[:-1,:].max()])
maxes.index = maxes.index.str.lower() + "_max"
summary = pandas.concat([totals, maxes])
if base_cols is None:
base_cols = list(summary.index.drop_duplicates(keep="first"))
base_cols = [b for b in base_cols if "bpm" not in b]
summary - summary[base_cols]
summaries.append(summary)
summary = pandas.concat(summaries, asix=1).T
game = pandas.concat([summary, line_score], axis=1)
game["home"] = [0, 1]
game_opp = game.iloc[::-1].reset_index()
game_opp.columns += "_opp"
full_game = pandas.concat([game, game_opp], axis=1)
full_game["season"] = read_season_info("soup")
full_game["date"] = os.path.basename(box_score)[:8]
full_game["date"] = pandas.to_datetime(full_game["date"], format="%Y%m%d")
full_game["won"] = full_game["total"] > full_game["total_opp"]
games.append(full_game)
if len(games) % 100 == 0:
print(f"{len(games)} / {len(box_scores)}")
Related
I couldn't dump the json dict completly
I just could dump the json dict last page. Help me,please.
the coding showed below:
def job_list(url):
htmlFile = requests.get(url)
objSoup = bs4.BeautifulSoup(htmlFile.text,'lxml')
jobs = objSoup.find_all('article',class_='js-job-item')
job_list = []
for job in jobs:
cust_name = job.get('data-cust-name')
print("公司名稱:",cust_name)
job_name = job.get('data-job-name')
print("職稱名稱:",job_name)
d = [('公司名稱',cust_name),('職務名稱', job_name)]
j_dict = dict(d)
job_list.append(j_dict)
url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
url = url_H+str(page+1)+url_T
job_list(url)
print('-'*70)
time.sleep(random.randint(3,5))
myjob = {'Job':job_list}
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
json.dump(myjob,fnObj,indent=2,ensure_ascii=False)
Try this code
jobs_to_dump = [] #### added
def job_list(url):
htmlFile = requests.get(url)
objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
jobs = objSoup.find_all('article', class_='js-job-item')
job_list = []
for job in jobs:
cust_name = job.get('data-cust-name')
print("公司名稱:", cust_name)
job_name = job.get('data-job-name')
print("職稱名稱:", job_name)
d = [('公司名稱', cust_name), ('職務名稱', job_name)]
j_dict = dict(d)
jobs_to_dump.append(j_dict) ###modified
url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
url = url_H + str(page + 1) + url_T
job_list(url)
print('-' * 70)
time.sleep(random.randint(3, 5))
myjob = {'Job': jobs_to_dump} #### modified
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
json.dump(myjob, fnObj, indent=2, ensure_ascii=False)
I am trying to split up a json file from alpha-vantages api into separate files depending on the date. I'm also trying to reformat the file to have blank values in the gaps where dates are missing. The following code is what I have come up with but it gives me the TypeError: 'list' object is not callable". I'm fairly new to python and pandas so I'm sure there is a better way to go about this.
import requests
import pandas as pd
from datetime import datetime, timedelta
from dateutil import parser
import numpy as np
from pandas import DataFrame
import json
symbol = "MSFT"
symbol_list = symbol.split(",")
def num_el(list):
count = 0
for element in list:
count += 1
return count
def csv_make(sy, dar, dat):
csv_file = open(f"{sy}_1min_{dar}.csv", "w", newline="")
csv_file.write(dat)
csv_file.close()
i = 0
x = -1
n = num_el(symbol_list)
while i < n:
namesym = symbol_list[x]
ticker = namesym
api_key = 'APIKEYHERE'
url = f'https://www.alphavantage.co/query?function=TIME_SERIES_INTRADAY&symbol={ticker}&outputsize=full&interval=1min&apikey={api_key}'
data = requests.get(url)
dsf = data.json()
daf = pd.DataFrame(dsf['Time Series (1min)'])
dxf: DataFrame = daf.T
dxf.index.name = 'time'
dxf.reset_index(inplace=True)
dxf['time'] = pd.to_datetime(dxf['time'])
dxf['minute'] = dxf['time'].dt.time
dxf['day'] = dxf['time'].dt.day
dxf['date'] = dxf['time'].dt.date
agg = dxf.groupby([dxf['day']])
length1 = dxf.groupby([dxf['day']]).size()
length = pd.DataFrame(length1)
length.index.name = 'day'
length.reset_index(inplace=True)
length_sum = length[0].sum()
v = 0
d = length_sum
b = len(length)
x2 = length_sum
while v < b:
a = length[0][v]
x2 -= length[0][v]
xd = agg.get_group(length['day'][v])
date = xd['date'][x2]
max_dt = parser.parse(str(max(xd['minute'])))
min_dt = parser.parse(str(min(xd['minute'])))
dt_range = []
while min_dt <= max_dt:
dt_range.append(min_dt.strftime("%H:%M:%S"))
min_dt += timedelta(seconds=60)
complete_df = pd.DataFrame({'minute': dt_range})
xy = complete_df.astype('str')
yx = xd.astype('str')
dasf = xy.merge(yx, how='left', on='minute')
dasf['ev'] = np.where(dasf['1. open'].notnull(), 'False', 'True')
time = []
open = []
high = []
low = []
close = []
volume = []
empty_value = []
for ib in range(len(dasf)):
time.append(dasf['minute'][ib])
open.append(dasf['1. open'][ib])
high.append(dasf['2. high'][ib])
low.append(dasf['3. low'][ib])
close.append(dasf['4. close'][ib])
volume.append(dasf['5. volume'][ib])
empty_value.append(dasf['ev'][ib])
time_df = pd.DataFrame(time).rename(columns={0: 'Time'})
open_df = pd.DataFrame(open).rename(columns={0: 'Open'})
high_df = pd.DataFrame(high).rename(columns={0: 'High'})
low_df = pd.DataFrame(low).rename(columns={0: 'Low'})
close_df = pd.DataFrame(close).rename(columns={0: 'Close'})
volume_df = pd.DataFrame(volume).rename(columns={0: 'Volume'})
empty_value_df = pd.DataFrame(empty_value).rename(columns={0: 'Empty Value'})
frames = [time_df, open_df, high_df, low_df, close_df, volume_df, empty_value_df]
df = pd.concat(frames, axis=1, join='inner')
df = df.set_index('Time')
ad = df.to_csv()
csv_make(namesym, date, ad)
v += 1
i += 1
When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from googletrans import Translator
translator = Translator()
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 =[]
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []
START_PAGE = 1
END_PAGE = 42
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
sleep(randint(2,10))
for container in tbody:
col1 = container.find_all('tr', {'data-id':'26079'})
for info in col1:
col_1 = info.find_all('td')
for data in col_1:
party = data[0]
party_trans = translator.translate(party)
pty_n.append(party_trans)
pvotes = data[1]
pv1.append(pvotes)
pshare = data[2]
pvs1.append(pshare)
mandates = data[3]
seat.append(mandates)
col2 = container.find_all('tr', {'data-id':'26075'})
for info in col2:
col_2 = info.find_all('td')
for data in col_2:
party2 = data[0]
party_trans2 = translator.translate(party2)
pty_n.append(party_trans2)
pvotes2 = data[1]
pv1.append(pvotes2)
pshare2 = data[2]
pvs1.append(pshare2)
mandates2 = data[3]
seat.append(mandates2)
col3 = container.find_all('tr', {'data-id':'26063'})
for info in col3:
col_3 = info.find_all('td')
for data in col_3:
party3 = data[0].text
party_trans3 = translator.translate(party3)
pty_n.extend(party_trans3)
pvotes3 = data[1].text
pv1.extend(pvotes3)
pshare3 = data[2].text
pvs1.extend(pshare3)
mandates3 = data[3].text
seat.extend(mandates3)
col4 = container.find_all('tr', {'data-id':'26091'})
for info in col4:
col_4 = info.find_all('td',recursive=True)
for data in col_4:
party4 = data[0]
party_trans4 = translator.translate(party4)
pty_n.extend(party_trans4)
pvotes4 = data[1]
pv1.extend(pvotes4)
pshare4 = data[2]
pvs1.extend(pshare4)
mandates4 = data[3]
seat.extend(mandates4)
col5 = container.find_all('tr', {'data-id':'26073'})
for info in col5:
col_5 = info.find_all('td')
for data in col_5:
party5 = data[0]
party_trans5 = translator.translate(party5)
pty_n.extend(party_trans5)
pvotes5 = data[1]
pv1.extend(pvotes5)
pshare5 = data[2]
pvs1.extend(pshare5)
mandates5 = data[3]
seat.extend(mandates5)
col6 = container.find_all('tr', {'data-id':'26080'})
for info in col6:
col_6 = info.find_all('td')
for data in col_6:
party6 = data[0]
party_trans6 = translator.translate(party6)
pty_n.extend(party_trans6)
pvotes6 = data[1]
pv1.extend(pvotes6)
pshare6 = data[2]
pvs1.extend(pshare6)
mandates6 = data[3]
seat.extend(mandates6)
#### TOTAL VOTES ####
tfoot = soup.find_all('tfoot')
for data in tfoot:
fvote = data.find_all('td')
for info in fvote:
votefinal = info.find(text=True).get_text()
fvoteindiv = [votefinal]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
#### CONSTITUENCY NAMES ####
constit = soup.find_all('a', class_='btn btn-link last')
for data in constit:
names = data.get_text()
names_clean = names.replace("Sejum Constituency no.","")
names_clean2 = names_clean.replace("[","")
names_clean3 = names_clean2.replace("]","")
namesfinal = names_clean3.split()[1]
constitindiv = [namesfinal]
constitlist = constitindiv * (len(pty_n) - len(cst_n))
cst_n.extend(constitlist)
#### UNSCRAPABLE INFO ####
region = 'Europe'
reg2 = [region]
reglist = reg2 * (len(pty_n) - len(rg))
rg.extend(reglist)
country = 'Poland'
ctr2 = [country]
ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
ctr_n.extend(ctrlist)
year = '2019'
yr2 = [year]
yrlist = yr2 * (len(pty_n) - len(yr))
yr.extend(yrlist)
month = '10'
mo2 = [month]
molist = mo2 * (len(pty_n) - len(mn))
mn.extend(molist)
codes = ''
codes2 = [codes]
codeslist = codes2 * (len(pty_n) - len(manual))
manual.extend(codeslist)
noinfo = '-990'
noinfo2 = [noinfo]
noinfolist = noinfo2 * (len(pty_n) - len(no_info))
no_info.extend(noinfolist)
print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))
poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})
print(poland19)
poland19.to_csv('poland_19.csv')
As commented you probably need to use Selenium. You could replace the requests lib and replace the request statements with sth like this:
from selenium import webdriver
wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')
You need to follow the instructions to install and implement the selenium lib at this link: https://selenium-python.readthedocs.io/
Note: I tested your code with selenium and I was able to get the table that you were looking for, but with the class_=... does not work for some reason.
Instead browsing at the scraped data I found that it has an attribute id. So maybe try also this instead:
tbody = soup.find_all('table', id="DataTables_Table_0")
And again, by doing the get requests with the selenium lib.
Hope that was helpful :)
Cheers
I'm getting to error 'find() takes no keyword arguments' on the line of code place = racers.find('td', class_='horse_number').get_text()
I presume this is due to the nested for loop - is find onto find the problem??
My goal is to get detail of the race in first loop, second loop reiterate over each runner within the race, third for loop to get the times that meet each nested if statement.
for race in results:
race_number = race.find('td', class_='raceNumber').get_text()
race_name1 = race.find('td', class_='raceTitle').get_text()
race_title1 = race.find('td', class_='raceInformation').get_text()
race_title1 = ' '.join(race_title1.split())
race_distance1 = race.find('td', class_='distance').get_text()
tableofdata = race.find('table', class_='raceFieldTable')
for racers in tableofdata:
place = racers.find('td', class_='horse_number').get_text()
horsename = racers.find('a', class_='horse_name_link')
horsename = horsename.text.replace('HorseName: ', '') if horsename else ''
prizemoney = racers.find('td', class_='prizemoney')
prizemoney = prizemoney.text.replace('Prizemoney: ', '') if prizemoney else ''
barrier = racers.find('td', class_='barrier')
barrier = barrier.text.replace('Row: ', '') if barrier else ''
#tabnumber = race.find('td', class_='horse_number')
#tabnumber = tabnumber.text.replace('HorseNumber: ', '') if tabnumber else ''
#print(tabnumber, tr2)
trainer = racers.find_all('td', class_='trainer-short')
trainer = trainer.text.replace('Trainer: ', '') if trainer else ''
driver = racers.find_all('td', class_='driver-short')
driver = driver.text.replace('Driver: ', '') if driver else ''
margin = racers.find_all('td', class_='margin')
margin = margin.text.replace('Margin: ', '') if margin else ''
startingprice = racers.find_all('td', class_='starting_price')
startingprice = startingprice.text.replace('StartingOdds: ', '')
startingprice = startingprice.replace('Â', ' ')if startingprice else ''
stewardscomments = racers.find_all('span', class_='stewardsTooltip')
stewardscomments = stewardscomments.text.replace('StewardsComments: ', '') if horsename else ''
scratchingnumber = racers.find_all('td', class_='number')
scratchingnumber = scratchingnumber.text.replace('Scratching: ', '') if scratchingnumber else ''
tableoftimes = race.find('table', class_='raceTimes')
for row in tableoftimes.select('td>strong:contains(":")'):
for t in row:
if "Track Rating:" in t:
trackrating = t.next_element.strip()
else:
trackrating = ''
if "Gross Time:" in t:
grosstime = t.next_element.strip()
else:
grosstime = ''
if "Mile Rate:" in t:
milerate = t.next_element.strip()
else:
milerate = ''
if "Lead Time:" in t:
leadtime = t.next_element.strip()
else:
leadtime = ''
if "First Quarter:" in t:
firstquarter = t.next_element.strip()
else:
firstquarter = ''
if "Second Quarter:" in t:
secondquarter = t.next_element.strip()
else:
secondquarter = ''
if "Third Quarter:" in t:
thirdquarter = t.next_element.strip()
else:
thirdquarter = ''
if "Fourth Quarter:" in t:
fourthquarter = t.next_element.strip()
else:
fourthquarter = ''
Last query is this replace doesnt work - still prints $2.40Â onto csv file
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
....
startingprice = startingprice.replace('Â', ' ')if startingprice else ''
....
writer.writerow([tr2, race_number, race_name1, race_title1, race_distance1, place, horsename, prizemoney, barrier, trainer, driver, margin, startingprice, stewardscomments, scratchingnumber, trackrating, grosstime, milerate, leadtime, firstquarter, secondquarter, thirdquarter, fourthquarter])
UPDATED
Start of HTML with scraping looks like below
from datetime import datetime, date, timedelta
import requests
import re
import csv
import os
import numpy
import pandas as pd
from bs4 import BeautifulSoup as bs
from simplified_scrapy import SimplifiedDoc,req,utils
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = bs(webpage_response.content, "html.parser")
format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
enddate = datetime(2020, 4, 20)
#prints header in csv
writer.writerow(['Venue', 'RaceNumber', 'RaceName', 'RaceTitle', 'RaceDistance', 'Place', 'HorseName', 'Prizemoney', 'Row', 'Trainer', 'Driver', 'Margin', 'StartingOdds', 'StewardsComments', 'Scratching', 'TrackRating', 'Gross_Time', 'Mile_Rate', 'Lead_Time', 'First_Quarter', 'Second_Quarter', 'Third_Quarter', 'Fourth_Quarter'])
while enddate <= yesterday:
enddate += timedelta(days=1)
enddate1 = enddate.strftime("%d-%m-%y")
new_url = base_url + str(enddate1)
soup12 = requests.get(new_url)
soup1 = bs(soup12.content, "html.parser")
table1 = soup1.find('table', class_='meetingListFull')
tr = table1.find_all('tr', {'class':['odd', 'even']})
for tr1 in tr:
tr2 = tr1.find('a').get_text()
tr3 = tr1.find('a')['href']
newurl = base1_url + tr3
with requests.Session() as s:
webpage_response = s.get(newurl)
soup = bs(webpage_response.content, "html.parser")
#soup1 = soup.select('.content')
results = soup.find_all('div', {'class':'forPrint'})
#resultsv2 = soup.find_all('table', {'class':'raceFieldTable'})
Expect the CSV to look like
I'm trying to extract all WC 2019 players batting stats, query got stuck with an error "list index out of range" at the player: http://www.espncricinfo.com/india/content/player/398438.html
How can I handle exception or PASS to get complete team player stats?
url2 = 'http://stats.espncricinfo.com/ci/engine/player/' + \
str(player_id) + \
'.htmlclass=2;template=results;type=batting;view=innings'
html = urllib.request.urlopen(url2, context=ctx).read()
temp_data = OrderedDict()
list_of_dict = []
bs = BeautifulSoup(html, 'lxml')
table_body = bs.find_all('tbody')
rows = table_body[1].find_all('tr')
for row in rows:
cols = row.find_all('td')
cols = [x.text.strip() for x in cols]
temp_data = OrderedDict()
for i in range(len(cols)):
temp_data["Runs"] = cols[0]
temp_data["Mins"] = cols[1]
temp_data["BF"] = cols[2]
temp_data["fours"] = cols[3]
temp_data["sixs"] = cols[4]
temp_data["SR"] = cols[5]
temp_data["POS"] = cols[6]
temp_data["Dismissal"] = cols[7]
temp_data["Inns"] = cols[8]
temp_data["Opposition"] = cols[10]
temp_data["Ground"] = cols[11]
temp_data["Date"] = cols[12]
temp_data["player"] = player
temp_data["playerid"] = player_id
list_of_dict.append(temp_data)
df = pd.DataFrame(list_of_dict)
df
df.to_sql("dummy", con, if_exists="append")
I'd like to extract all WC squad wise player stats.