When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
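In case it helps, the Session variant followed the usual pattern from that answer - roughly this sketch (the header and the single page ID are just illustrative, not my exact code):
import requests
from bs4 import BeautifulSoup

# roughly the requests.Session() attempt (illustrative only)
with requests.Session() as s:
    s.headers.update({'User-Agent': 'Mozilla/5.0'})  # illustrative header
    page = s.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=100&res=pm")
    soup = BeautifulSoup(page.text, 'html.parser')
Here's the main loop I'm actually running: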
for page in range(100, 350):
    page = requests.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=" + str(page) + "&res=pm")
    page.encoding = page.apparent_encoding
    if not page:
        pass
    else:
        soup = BeautifulSoup(page.text, 'html.parser')
        ghana_tbody = soup.find_all('tbody')
        sleep(randint(2,10))
        for container in ghana_tbody:
            #### CANDIDATES ####
            candidate = container.find_all('div', class_='can par')
            for data in candidate:
                cand = data.find('h4')
                for info in cand:
                    if cand is not None:
                        can2 = info.get_text()
                        can.append(can2)
            #### PARTY NAMES ####
            partyn = container.find_all('h5')
            for data in partyn:
                if partyn is not None:
                    partyn2 = data.get_text()
                    pty_n.append(partyn2)
            #### CANDIDATE VOTES ####
            votec = container.find_all('td', class_='votes')
            for data in votec:
                if votec is not None:
                    votec2 = data.get_text()
                    cv1.append(votec2)
            #### CANDIDATE VOTE SHARE ####
            cansh = container.find_all('td', class_='percent')
            for data in cansh:
                if cansh is not None:
                    cansh2 = data.get_text()
                    cvs1.append(cansh2)
        #### TOTAL VOTES ####
        tfoot = soup.find_all('tr', class_='total')
        for footer in tfoot:
            fvote = footer.find_all('td', class_='votes')
            for data in fvote:
                if fvote is not None:
                    fvote2 = data.get_text()
                    fvoteindiv = [fvote2]
                    fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
                    vot1.extend(fvotelist)
Thanks in advance for your help!
I've made some simplifying changes. The major changes needed were:
ghana_tbody = soup.find_all('table', class_='canResults')
can2 = info # not info.get_text()
I have only tested this against page 112; life is too short.
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
can = []
pty_n = []
cv1 = []
cvs1 = []
vot1 = []
START_PAGE = 112
END_PAGE = 112
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://www.ghanaweb.com/GhanaHomePage/election2012/parliament.constituency.php?ID=112&res=pm")
    page.encoding = page.apparent_encoding
    if not page:
        pass
    else:
        soup = BeautifulSoup(page.text, 'html.parser')
        ghana_tbody = soup.find_all('table', class_='canResults')
        sleep(randint(2,10))
        for container in ghana_tbody:
            #### CANDIDATES ####
            candidate = container.find_all('div', class_='can par')
            for data in candidate:
                cand = data.find('h4')
                for info in cand:
                    can2 = info  # not info.get_text()
                    can.append(can2)
            #### PARTY NAMES ####
            partyn = container.find_all('h5')
            for data in partyn:
                partyn2 = data.get_text()
                pty_n.append(partyn2)
            #### CANDIDATE VOTES ####
            votec = container.find_all('td', class_='votes')
            for data in votec:
                votec2 = data.get_text()
                cv1.append(votec2)
            #### CANDIDATE VOTE SHARE ####
            cansh = container.find_all('td', class_='percent')
            for data in cansh:
                cansh2 = data.get_text()
                cvs1.append(cansh2)
        #### TOTAL VOTES ####
        tfoot = soup.find_all('tr', class_='total')
        for footer in tfoot:
            fvote = footer.find_all('td', class_='votes')
            for data in fvote:
                fvote2 = data.get_text()
                fvoteindiv = [fvote2]
                fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
                vot1.extend(fvotelist)
print('can = ', can)
print('pty_n = ', pty_n)
print('cv1 = ', cv1)
print('cvs1 = ', cvs1)
print('vot1 = ', vot1)
Prints:
can = ['Kwadwo Baah Agyemang', 'Daniel Osei', 'Anyang - Kusi Samuel', 'Mary Awusi']
pty_n = ['NPP', 'NDC', 'IND', 'IND']
cv1 = ['14,966', '9,709', '8,648', '969', '34292']
cvs1 = ['43.64', '28.31', '25.22', '2.83', '\xa0']
vot1 = ['34292', '34292', '34292', '34292']
Be sure to first change START_PAGE and END_PAGE to 100 and 350 respectively.
I'm looking to scrape a set of URLs - I want to visit each link on the given URL, and return the player's pos1 pos2 and profile details.
I have two sets of URLs I'm looking at, G League players (which is working perfectly) and International Players (which I'm completely stuck on).
The sites seem to be almost identical, but not sure what's going on.
WORKING G LEAGUE SCRIPT:
import requests
from bs4 import BeautifulSoup
import gspread

gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('SSID')
worksheet = sh.get_worksheet(0)
# AddValue = ["Test", 25, "Test2"]
# worksheet.insert_row(AddValue, 3)

def get_links(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.find_all('td', {'data-th': 'Player'}):
        a_tag = td.a
        name = a_tag.text
        player_url = a_tag['href']
        pos = td.find_next_sibling('td').text
        print(f"Getting {name}")
        req_player_url = requests.get(
            f"https://basketball.realgm.com{player_url}")
        soup_player = BeautifulSoup(req_player_url.content, "html.parser")
        div_profile_box = soup_player.find("div", class_="profile-box")
        row = {"Name": name, "URL": player_url, "pos_option1": pos}
        row['pos_option2'] = div_profile_box.h2.span.text
        for p in div_profile_box.find_all("p"):
            try:
                key, value = p.get_text(strip=True).split(':', 1)
                row[key.strip()] = value.strip()
            except:  # not all entries have values
                pass
        data.append(row)
    return data

urls = [
    'https://basketball.realgm.com/dleague/players/2022',
    'https://basketball.realgm.com/dleague/players/2021',
    'https://basketball.realgm.com/dleague/players/2020',
    'https://basketball.realgm.com/dleague/players/2019',
    'https://basketball.realgm.com/dleague/players/2018',
]

res = []
for url in urls:
    print(f"Getting: {url}")
    data = get_links(url)
    res = [*res, *data]

if res != []:
    header = list(res[0].keys())
    values = [
        header, *[[e[k] if e.get(k) else "" for k in header] for e in res]]
    worksheet.append_rows(values, value_input_option="USER_ENTERED")
Like I stated, this prints the positions along with the rest of the profile details. I'm trying to recreate it for a different set of URLs, but I'm hitting an error:
This is the script I'm stuck on, any thoughts?
import requests
from bs4 import BeautifulSoup
import gspread

gc = gspread.service_account(filename='creds.json')
sh = gc.open_by_key('1DpasSS8yC1UX6WqAbkQ515BwEEjdDL-x74T0eTW8hLM')
worksheet = sh.get_worksheet(0)
# AddValue = ["Test", 25, "Test2"]
# worksheet.insert_row(AddValue, 3)

def get_links2(url):
    data = []
    req_url = requests.get(url)
    soup = BeautifulSoup(req_url.content, "html.parser")
    for td in soup.select('td.nowrap'):
        a_tag = td.a
        if a_tag:
            name = a_tag.text
            player_url = a_tag['href']
            pos = td.find_next_sibling('td').text
            print(f"Getting {name}")
            req_player_url = requests.get(
                f"https://basketball.realgm.com{player_url}")
            soup_player = BeautifulSoup(req_player_url.content, "html.parser")
            div_profile_box = soup_player.find("div", class_="profile-box")
            row = {"Name": name, "URL": player_url, "pos_option1": pos}
            row['pos_option2'] = div_profile_box.h2.span.text
            for p in div_profile_box.find_all("p"):
                try:
                    key, value = p.get_text(strip=True).split(':', 1)
                    row[key.strip()] = value.strip()
                except:  # not all entries have values
                    pass
            data.append(row)
    return data
urls2 = ["https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/player/All/desc","https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/2",
"https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/3",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/4",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/5",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/6",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/7",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/8",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/9",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/10",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/11",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/12",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/13",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/14",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/15",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/16",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/17",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/18",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/19",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/20",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/21",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/22",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/23",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/24",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/25",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/26",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/27",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/28",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/29",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/30",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/31",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/32",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/33",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/34",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/35",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/36",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/37",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/38",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/39",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/40",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/41",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/42",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/43",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/44",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/45",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/46",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/47",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/48",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/49",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/50",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/51",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/52",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/53",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/54",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/55",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/56",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/57",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/58",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/59",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/60",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/61",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/62",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/63",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/64",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/65",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/66",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/67",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/68",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/69",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/70",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/71",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/72",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/73",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/74",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/75",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/76",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/77",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/78",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/79",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/80",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/81",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/82",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/83",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/84",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/85",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/86",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/87",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/88",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/89",
# "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/90",
# # "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/91",
# # "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/92",
# # "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/93",
# # "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/94",
# # "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/95",
# # "https://basketball.realgm.com/international/stats/2023/Averages/Qualified/All/minutes/All/desc/96"]
]
res2 = []
for url in urls2:
    data = get_links2(url)
    res2 = [*res2, *data]
# print(res2)
if res2 != []:
    header = list(res2[0].keys())
    values = [
        header, *[[e[k] if e.get(k) else "" for k in header] for e in res2]]
    worksheet.append_rows(values, value_input_option="USER_ENTERED")
As mentioned, there are differences in the HTML, so be aware:
pos = td.find_next_sibling('td').text will lead to wrong information, because there is no position column in the tables of the new url set.
To get the position from the profile, check whether the element that holds the information is available before calling .text (see the sketch after the sample output below):
row['pos_option2'] = div_profile_box.h2.span.text if div_profile_box.h2.span else None
So you would get:
I used this url https://basketball.realgm.com/international/league/119/VTB-Youth-United-League/team/1952/Avtodor-2/stats to start get_links2(url), because there was no indicator in your question of where the issue appears:
{'Name': 'Klim Adaykin',
'URL': '/player/Klim-Adaykin/Summary/207122',
'pos_option1': 'AV2',
'pos_option2': None,
'Current Team': 'Avtodor-2',
'Nationality': 'Russia',
'Current NBA Status': 'Draft Eligible in 2023',
'Draft Entry': '2023 NBA Draft',
'Pre-Draft Team': 'Avtodor-2 (Russia)'}
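If the profile box on these international pages does expose the position as one of its Key: Value paragraphs (an assumption - verify it against the real profile-box HTML), the loop that already splits each <p> on ':' will have captured it, and a fallback could look like this sketch:
# hedged sketch: fall back to the parsed profile paragraphs when the <h2><span> is missing
row['pos_option2'] = div_profile_box.h2.span.text if div_profile_box.h2.span else None
for p in div_profile_box.find_all("p"):
    try:
        key, value = p.get_text(strip=True).split(':', 1)
        row[key.strip()] = value.strip()
    except:  # not all entries have values
        pass
if not row['pos_option2']:
    row['pos_option2'] = row.get('Position')  # 'Position' is an assumed key name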
I am trying to scrape data from a job searching platform called Job Street. The web crawler works but the generated csv file is empty with no data. The expected output should be a list of jobs with the job title, description, etc.
Below is my code. I am performing this task using selenium. I would really appreciate the help. Thank you in advance.
# imports needed by the code below
import time
import math
import json
import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) (KHTML, like Gecko) Chrome/102.0.5005.61'}
path = "/Users/Downloads/jobstreet-scraper-main/chromedriver"
driver = Chrome(executable_path=path)
time.sleep(2)
base_url = "https://www.jobstreet.com.my/en/job-search/{}-jobs/{}/"
def get_page_number(keyword):
    # input: keyword for job_postings
    # output: number of pages
    url = base_url.format(keyword, 1)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    result_text = soup.find("span", {"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _1d0g9qk4 _18qlyvc8"})
    results = result_text.text.split()
    result = result_text.text.split()[-2]
    result1 = result.replace(',', '')
    result2 = int(result1)
    page_number = math.ceil(result2/30)
    return page_number

def job_page_scraper(link):
    url = "https://www.jobstreet.com.my" + link
    print("scraping...", url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    scripts = soup.find_all("script")
    for script in scripts:
        if script.contents:
            txt = script.contents[0].strip()
            if 'window.REDUX_STATE = ' in txt:
                jsonStr = script.contents[0].strip()
                jsonStr = jsonStr.split('window.REDUX_STATE = ')[1].strip()
                jsonStr = jsonStr.split('}}}};')[0].strip()
                jsonStr = jsonStr + "}}}}"
                jsonObj = json.loads(jsonStr)
                job = jsonObj['details']
                job_id = job['id']
                job_expired = job['isExpired']
                job_confidential = job['isConfidential']
                job_salary_min = job['header']['salary']['min']
                job_salary_max = job['header']['salary']['max']
                job_salary_currency = job['header']['salary']['currency']
                job_title = job['header']['jobTitle']
                company = job['header']['company']['name']
                job_post_date = job['header']['postedDate']
                job_internship = job['header']['isInternship']
                company_website = job['companyDetail']['companyWebsite']
                company_avgProcessTime = job['companyDetail']['companySnapshot']['avgProcessTime']
                company_registrationNo = job['companyDetail']['companySnapshot']['registrationNo']
                company_workingHours = job['companyDetail']['companySnapshot']['workingHours']
                company_facebook = job['companyDetail']['companySnapshot']['facebook']
                company_size = job['companyDetail']['companySnapshot']['size']
                company_dressCode = job['companyDetail']['companySnapshot']['dressCode']
                company_nearbyLocations = job['companyDetail']['companySnapshot']['nearbyLocations']
                company_overview = job['companyDetail']['companyOverview']['html']
                job_description = job['jobDetail']['jobDescription']['html']
                job_summary = job['jobDetail']['summary']
                job_requirement_career_level = job['jobDetail']['jobRequirement']['careerLevel']
                job_requirement_yearsOfExperience = job['jobDetail']['jobRequirement']['yearsOfExperience']
                job_requirement_qualification = job['jobDetail']['jobRequirement']['qualification']
                job_requirement_fieldOfStudy = job['jobDetail']['jobRequirement']['fieldOfStudy']
                #job_requirement_industry = job['jobDetail']['jobRequirement']['industryValue']['label']
                job_requirement_skill = job['jobDetail']['jobRequirement']['skills']
                job_employment_type = job['jobDetail']['jobRequirement']['employmentType']
                job_languages = job['jobDetail']['jobRequirement']['languages']
                job_benefits = job['jobDetail']['jobRequirement']['benefits']
                job_apply_url = job['applyUrl']['url']
                job_location_zipcode = job['location'][0]['locationId']
                job_location = job['location'][0]['location']
                job_country = job['sourceCountry']
                return [job_id, job_title, job_expired, job_confidential, job_salary_max, job_salary_max, job_salary_currency, company, job_post_date, job_internship, company_website, company_avgProcessTime, company_registrationNo, company_workingHours, company_facebook, company_size, company_dressCode, company_nearbyLocations, company_overview, job_description, job_summary, job_requirement_career_level, job_requirement_fieldOfStudy, job_requirement_yearsOfExperience, job_requirement_qualification, job_requirement_skill, job_employment_type, job_languages, job_benefits, job_apply_url, job_location_zipcode, job_location, job_country]

def page_crawler(keyword):
    # input: keyword for job postings
    # output: dataframe of links scraped from each page
    # page number
    page_number = get_page_number(keyword)
    job_links = []
    for n in range(page_number):
        print('Loading page {} ...'.format(n+1))
        url = base_url.format(keyword, n+1)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        #extract all job links
        links = soup.find_all('a', {'class': 'sx2jih0'})
        job_links += links
    jobs = []
    for link in job_links:
        job_link = link['href'].strip().split('?', 1)[0]
        jobs.append([keyword, job_link] + job_page_scraper(job_link))
    result_df = pd.DataFrame(jobs, columns = ['keyword', 'link', 'job_id', 'job_title', 'job_expired', 'job_confidential', 'job_salary_max', 'job_salary_max', 'job_salary_currency', 'company', 'job_post_date', 'job_internship', 'company_website', 'company_avgProcessTime', 'company_registrationNo', 'company_workingHours', 'company_facebook', 'company_size', 'company_dressCode', 'company_nearbyLocations', 'company_overview', 'job_description', 'job_summary', 'job_requirement_career_level', 'job_requirement_fieldOfStudy', 'job_requirement_yearsOfExperience', 'job_requirement_qualification', 'job_requirement_skill', 'job_employment_type', 'job_languages', 'job_benefits', 'job_apply_url', 'job_location_zipcode', 'job_location', 'job_country'])
    return result_df

def main():
    # a list of job roles to be crawled
    key_words = ['medical doctor']
    dfs = []
    for key in key_words:
        key_df = page_crawler(key)
        dfs.append(key_df)
    # save scraped information as csv
    pd.concat(dfs).to_csv("job_postings_results.csv")

if __name__ == '__main__':
    main()
I'm having trouble with my web scraper not getting the "Odds" values, and I'm not sure what is wrong. For each piece of information, I am using a try/except to see if the element is available, but I can't see what is wrong with getting the Odds values specifically. Thanks for the help.
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()
    # Get matchup and odds
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        odd_margin = '-'
        odd_avail = False
    # Get favored team
    try:
        odd_team_win = data[1].find_all('img')[-1]['title']
    except:
        odd_team_win = '-'
        odd_avail = False
    # Get simulation winner
    try:
        sim_team_win = data[2].find('img')['title']
    except:
        sim_team_win = '-'
        odd_avail = False
    awayTeam = matchup.split('#')[0].strip()
    homeTeam = matchup.split('#')[1].strip()
    # Get simulation margin
    try:
        sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
    except:
        sim_margin = '-'
        odd_avail = False
    # If all variables available, determine odds, simulation margin points, and optimal bet
    if odd_avail == True:
        if odd_team_win == sim_team_win:
            diff = abs(sim_margin - odd_margin)
            if sim_margin > odd_margin:
                bet = odd_team_win
            else:
                if odd_team_win == homeTeam:
                    bet = awayTeam
                else:
                    bet = homeTeam
        else:
            diff = odd_margin + sim_margin
            bet = sim_team_win
    else:
        diff = -1
        bet = '-'
    # Create table
    row = {cols[0]: time, 'Matchup': matchup, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
           'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff, 'Bet': bet}
    rows.append(row)
df = pd.DataFrame(rows)
df = df.sort_values(by = ['Diff'], ascending = False)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When I run this code everything works perfectly and gets all other values but all the odds values in the table are '-'.
I added a few things into the code, to account for:
If the odds are Even (versus there being no odds at all)
If a team doesn't have a logo, to still put the team name
As far as the odds not showing: check the csv file to see if they're there. If they are, it might just be a preference you need to change in PyCharm (the console might just be cutting off part of the string).
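If you want to rule out a display issue before digging further, a quick sketch (using the df built at the end of the script below) is to print just the Odds column and write the csv in one go:
# quick check that the odds made it into the frame before blaming the console
print(df['Odds'].head(20))
df.to_csv('odds.csv', index=False)
Here is the full adjusted code: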
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
url = 'https://www.ncaagamesim.com/college-basketball-predictions.asp'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table')
# Get column names
headers = table.find_all('th')
cols = [x.text for x in headers]
# Get all rows in table body
table_rows = table.find_all('tr')
rows = []
# Grab the text of each td, and put into a rows list
for each in table_rows[1:]:
    odd_avail = True
    data = each.find_all('td')
    time = data[0].text.strip()
    # Get matchup and odds
    try:
        matchup, odds = data[1].text.strip().split('\xa0')
        odd_margin = float(odds.split('by')[-1].strip())
    except:
        matchup = data[1].text.strip()
        if 'Even' in matchup:
            matchup, odds = data[1].text.strip().split('\xa0')
            odd_margin = 0
        else:
            odd_margin = '-'
            odd_avail = False
    awayTeam = matchup.split('#')[0].strip()
    homeTeam = matchup.split('#')[1].strip()
    # Get favored team
    try:
        odd_team_win = data[1].find_all('img')[-1]['title']
    except:
        odd_team_win = '-'
        odd_avail = False
    # Get simulation winner
    try:
        sim_team_win = data[2].find('img')['title']
    except:
        if 'wins' in data[2].text:
            sim_team_win = data[2].text.split('wins')[0].strip()
        else:
            sim_team_win = '-'
            odd_avail = False
    # Get simulation margin
    try:
        sim_margin = float(re.findall("\d+\.\d+", data[2].text)[-1])
    except:
        sim_margin = '-'
        odd_avail = False
    # If all variables available, determine odds and simulation margin points
    if odd_avail == True:
        if odd_team_win == sim_team_win:
            diff = abs(sim_margin - odd_margin)
        else:
            diff = odd_margin + sim_margin
    else:
        diff = '-'
    # Create table
    row = {cols[0]: time, 'Away Team': awayTeam, 'Home Team': homeTeam, 'Odds Winner': odd_team_win, 'Odds': odd_margin,
           'Simulation Winner': sim_team_win, 'Simulation Margin': sim_margin, 'Diff': diff}
    rows.append(row)
df = pd.DataFrame(rows)
print (df.to_string())
# df.to_csv('odds.csv', index=False)
When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
from time import sleep
from random import randint
from googletrans import Translator
translator = Translator()
rg = []
ctr_n = []
ctr = []
yr = []
mn = []
sub = []
cst_n = []
cst = []
mag = []
pty_n = []
pty = []
can = []
pev1 = []
vot1 = []
vv1 = []
ivv1 = []
to1 = []
cv1 = []
cvs1 = []
pv1 = []
pvs1 = []
pev2 = []
vot2 = []
vv2 = []
ivv2 = []
to2 = []
cv2 = []
cvs2 =[]
pv2 = []
pvs2 = []
seat = []
no_info = []
manual = []
START_PAGE = 1
END_PAGE = 42
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
page.encoding = page.apparent_encoding
if not page:
pass
else:
soup = BeautifulSoup(page.text, 'html.parser')
tbody = soup.find_all('table', class_='table table-borderd table-striped table-hover dataTable no-footer clickable right2 right4')
sleep(randint(2,10))
for container in tbody:
col1 = container.find_all('tr', {'data-id':'26079'})
for info in col1:
col_1 = info.find_all('td')
for data in col_1:
party = data[0]
party_trans = translator.translate(party)
pty_n.append(party_trans)
pvotes = data[1]
pv1.append(pvotes)
pshare = data[2]
pvs1.append(pshare)
mandates = data[3]
seat.append(mandates)
col2 = container.find_all('tr', {'data-id':'26075'})
for info in col2:
col_2 = info.find_all('td')
for data in col_2:
party2 = data[0]
party_trans2 = translator.translate(party2)
pty_n.append(party_trans2)
pvotes2 = data[1]
pv1.append(pvotes2)
pshare2 = data[2]
pvs1.append(pshare2)
mandates2 = data[3]
seat.append(mandates2)
col3 = container.find_all('tr', {'data-id':'26063'})
for info in col3:
col_3 = info.find_all('td')
for data in col_3:
party3 = data[0].text
party_trans3 = translator.translate(party3)
pty_n.extend(party_trans3)
pvotes3 = data[1].text
pv1.extend(pvotes3)
pshare3 = data[2].text
pvs1.extend(pshare3)
mandates3 = data[3].text
seat.extend(mandates3)
col4 = container.find_all('tr', {'data-id':'26091'})
for info in col4:
col_4 = info.find_all('td',recursive=True)
for data in col_4:
party4 = data[0]
party_trans4 = translator.translate(party4)
pty_n.extend(party_trans4)
pvotes4 = data[1]
pv1.extend(pvotes4)
pshare4 = data[2]
pvs1.extend(pshare4)
mandates4 = data[3]
seat.extend(mandates4)
col5 = container.find_all('tr', {'data-id':'26073'})
for info in col5:
col_5 = info.find_all('td')
for data in col_5:
party5 = data[0]
party_trans5 = translator.translate(party5)
pty_n.extend(party_trans5)
pvotes5 = data[1]
pv1.extend(pvotes5)
pshare5 = data[2]
pvs1.extend(pshare5)
mandates5 = data[3]
seat.extend(mandates5)
col6 = container.find_all('tr', {'data-id':'26080'})
for info in col6:
col_6 = info.find_all('td')
for data in col_6:
party6 = data[0]
party_trans6 = translator.translate(party6)
pty_n.extend(party_trans6)
pvotes6 = data[1]
pv1.extend(pvotes6)
pshare6 = data[2]
pvs1.extend(pshare6)
mandates6 = data[3]
seat.extend(mandates6)
#### TOTAL VOTES ####
tfoot = soup.find_all('tfoot')
for data in tfoot:
fvote = data.find_all('td')
for info in fvote:
votefinal = info.find(text=True).get_text()
fvoteindiv = [votefinal]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
vot1.extend(fvotelist)
#### CONSTITUENCY NAMES ####
constit = soup.find_all('a', class_='btn btn-link last')
for data in constit:
names = data.get_text()
names_clean = names.replace("Sejum Constituency no.","")
names_clean2 = names_clean.replace("[","")
names_clean3 = names_clean2.replace("]","")
namesfinal = names_clean3.split()[1]
constitindiv = [namesfinal]
constitlist = constitindiv * (len(pty_n) - len(cst_n))
cst_n.extend(constitlist)
#### UNSCRAPABLE INFO ####
region = 'Europe'
reg2 = [region]
reglist = reg2 * (len(pty_n) - len(rg))
rg.extend(reglist)
country = 'Poland'
ctr2 = [country]
ctrlist = ctr2 * (len(pty_n) - len(ctr_n))
ctr_n.extend(ctrlist)
year = '2019'
yr2 = [year]
yrlist = yr2 * (len(pty_n) - len(yr))
yr.extend(yrlist)
month = '10'
mo2 = [month]
molist = mo2 * (len(pty_n) - len(mn))
mn.extend(molist)
codes = ''
codes2 = [codes]
codeslist = codes2 * (len(pty_n) - len(manual))
manual.extend(codeslist)
noinfo = '-990'
noinfo2 = [noinfo]
noinfolist = noinfo2 * (len(pty_n) - len(no_info))
no_info.extend(noinfolist)
print(len(rg), len(pty_n), len(pv1), len(pvs1), len(no_info), len(vot1), len(cst_n))
poland19 = pd.DataFrame({
'rg' : rg,
'ctr_n' : ctr_n,
'ctr': manual,
'yr' : yr,
'mn' : mn,
'sub' : manual,
'cst_n': cst_n,
'cst' : manual,
'mag': manual,
'pty_n': pty_n,
'pty': manual,
'can': can,
'pev1': no_info,
'vot1': vot1,
'vv1': vot1,
'ivv1': no_info,
'to1': no_info,
'cv1': no_info,
'cvs1': no_info,
'pv1': cv1,
'pvs1': cvs1,
'pev2': no_info,
'vot2': no_info,
'vv2': no_info,
'ivv2': no_info,
'to2': no_info,
'cv2': no_info,
'cvs2': no_info,
'pv2' : no_info,
'pvs2' : no_info,
'seat' : manual
})
print(poland19)
poland19.to_csv('poland_19.csv')
As commented, you probably need to use Selenium. You could drop the requests lib and replace the request statements with something like this:
from selenium import webdriver
wd = webdriver.Chrome('pathToChromeDriver') # or any other Browser driver
wd.get(url) # instead of requests.get()
soup = BeautifulSoup(wd.page_source, 'html.parser')
You need to follow the instructions to install and implement the selenium lib at this link: https://selenium-python.readthedocs.io/
Note: I tested your code with selenium and I was able to get the table that you were looking for, but selecting it by class_=... does not work for some reason.
Instead, browsing the scraped data I found that the table has an id attribute. So maybe try this instead:
tbody = soup.find_all('table', id="DataTables_Table_0")
And again, do the get requests with the selenium lib.
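Put together, the page-fetching part of your loop would look roughly like this sketch (the driver path and the DataTables_Table_0 id come from the suggestions above and should be verified against your own setup and the rendered page; START_PAGE and END_PAGE are the values from your script):
from bs4 import BeautifulSoup
from selenium import webdriver

wd = webdriver.Chrome('pathToChromeDriver')  # path to your chromedriver binary

for page in range(START_PAGE, END_PAGE + 1):
    # let the browser render the JavaScript-built table before parsing
    wd.get("https://sejmsenat2019.pkw.gov.pl/sejmsenat2019/en/wyniki/sejm/okr/" + str(page))
    soup = BeautifulSoup(wd.page_source, 'html.parser')
    tbody = soup.find_all('table', id="DataTables_Table_0")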
Hope that was helpful :)
Cheers
I'm getting the error 'find() takes no keyword arguments' on the line of code place = racers.find('td', class_='horse_number').get_text()
I presume this is due to the nested for loop - is a find inside another find the problem??
My goal is to get the details of the race in the first loop, have the second loop iterate over each runner within the race, and have the third for loop get the times that match each nested if statement.
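For context on that error: iterating a Tag like tableofdata directly walks its child nodes, including bare text nodes, and a text node's find() is the plain string method, which takes no keyword arguments. A minimal sketch of iterating the rows explicitly instead (same class names as in my code, untested against the real page):
tableofdata = race.find('table', class_='raceFieldTable')
# iterate the <tr> elements rather than the table's raw child nodes
for racers in tableofdata.find_all('tr'):
    cell = racers.find('td', class_='horse_number')
    place = cell.get_text() if cell else ''
The full loop is below: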
for race in results:
    race_number = race.find('td', class_='raceNumber').get_text()
    race_name1 = race.find('td', class_='raceTitle').get_text()
    race_title1 = race.find('td', class_='raceInformation').get_text()
    race_title1 = ' '.join(race_title1.split())
    race_distance1 = race.find('td', class_='distance').get_text()
    tableofdata = race.find('table', class_='raceFieldTable')
    for racers in tableofdata:
        place = racers.find('td', class_='horse_number').get_text()
        horsename = racers.find('a', class_='horse_name_link')
        horsename = horsename.text.replace('HorseName: ', '') if horsename else ''
        prizemoney = racers.find('td', class_='prizemoney')
        prizemoney = prizemoney.text.replace('Prizemoney: ', '') if prizemoney else ''
        barrier = racers.find('td', class_='barrier')
        barrier = barrier.text.replace('Row: ', '') if barrier else ''
        #tabnumber = race.find('td', class_='horse_number')
        #tabnumber = tabnumber.text.replace('HorseNumber: ', '') if tabnumber else ''
        #print(tabnumber, tr2)
        trainer = racers.find_all('td', class_='trainer-short')
        trainer = trainer.text.replace('Trainer: ', '') if trainer else ''
        driver = racers.find_all('td', class_='driver-short')
        driver = driver.text.replace('Driver: ', '') if driver else ''
        margin = racers.find_all('td', class_='margin')
        margin = margin.text.replace('Margin: ', '') if margin else ''
        startingprice = racers.find_all('td', class_='starting_price')
        startingprice = startingprice.text.replace('StartingOdds: ', '')
        startingprice = startingprice.replace('Â', ' ') if startingprice else ''
        stewardscomments = racers.find_all('span', class_='stewardsTooltip')
        stewardscomments = stewardscomments.text.replace('StewardsComments: ', '') if horsename else ''
        scratchingnumber = racers.find_all('td', class_='number')
        scratchingnumber = scratchingnumber.text.replace('Scratching: ', '') if scratchingnumber else ''
    tableoftimes = race.find('table', class_='raceTimes')
    for row in tableoftimes.select('td>strong:contains(":")'):
        for t in row:
            if "Track Rating:" in t:
                trackrating = t.next_element.strip()
            else:
                trackrating = ''
            if "Gross Time:" in t:
                grosstime = t.next_element.strip()
            else:
                grosstime = ''
            if "Mile Rate:" in t:
                milerate = t.next_element.strip()
            else:
                milerate = ''
            if "Lead Time:" in t:
                leadtime = t.next_element.strip()
            else:
                leadtime = ''
            if "First Quarter:" in t:
                firstquarter = t.next_element.strip()
            else:
                firstquarter = ''
            if "Second Quarter:" in t:
                secondquarter = t.next_element.strip()
            else:
                secondquarter = ''
            if "Third Quarter:" in t:
                thirdquarter = t.next_element.strip()
            else:
                thirdquarter = ''
            if "Fourth Quarter:" in t:
                fourthquarter = t.next_element.strip()
            else:
                fourthquarter = ''
One last query: this replace doesn't work - it still prints $2.40Â onto the csv file.
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
....
startingprice = startingprice.replace('Â', ' ') if startingprice else ''
....
writer.writerow([tr2, race_number, race_name1, race_title1, race_distance1, place, horsename, prizemoney, barrier, trainer, driver, margin, startingprice, stewardscomments, scratchingnumber, trackrating, grosstime, milerate, leadtime, firstquarter, secondquarter, thirdquarter, fourthquarter])
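For what it's worth, that Â usually comes from a non-breaking space (\xa0) in the scraped text rather than a literal 'Â' character, so normalizing the string may work where the plain replace didn't - a small sketch under that assumption:
import unicodedata

# assumption: the odds text carries a non-breaking space, e.g. '$2.40\xa0';
# NFKC normalization turns it into a regular space, which strip() then removes
startingprice = unicodedata.normalize('NFKC', startingprice).strip() if startingprice else ''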
UPDATED
The start of the scraping script looks like below.
from datetime import datetime, date, timedelta
import requests
import re
import csv
import os
import numpy
import pandas as pd
from bs4 import BeautifulSoup as bs
from simplified_scrapy import SimplifiedDoc,req,utils
file = open('harnessresults.csv', 'w', newline='', encoding='utf8')
writer = csv.writer(file)
base_url = "http://www.harness.org.au/racing/results/?firstDate="
base1_url = "http://www.harness.org.au"
webpage_response = requests.get('http://www.harness.org.au/racing/results/?firstDate=')
soup = bs(webpage_response.content, "html.parser")
format = "%d-%m-%y"
delta = timedelta(days=1)
yesterday = datetime.today() - timedelta(days=1)
enddate = datetime(2020, 4, 20)
#prints header in csv
writer.writerow(['Venue', 'RaceNumber', 'RaceName', 'RaceTitle', 'RaceDistance', 'Place', 'HorseName', 'Prizemoney', 'Row', 'Trainer', 'Driver', 'Margin', 'StartingOdds', 'StewardsComments', 'Scratching', 'TrackRating', 'Gross_Time', 'Mile_Rate', 'Lead_Time', 'First_Quarter', 'Second_Quarter', 'Third_Quarter', 'Fourth_Quarter'])
while enddate <= yesterday:
    enddate += timedelta(days=1)
    enddate1 = enddate.strftime("%d-%m-%y")
    new_url = base_url + str(enddate1)
    soup12 = requests.get(new_url)
    soup1 = bs(soup12.content, "html.parser")
    table1 = soup1.find('table', class_='meetingListFull')
    tr = table1.find_all('tr', {'class':['odd', 'even']})
    for tr1 in tr:
        tr2 = tr1.find('a').get_text()
        tr3 = tr1.find('a')['href']
        newurl = base1_url + tr3
        with requests.Session() as s:
            webpage_response = s.get(newurl)
            soup = bs(webpage_response.content, "html.parser")
            #soup1 = soup.select('.content')
            results = soup.find_all('div', {'class':'forPrint'})
            #resultsv2 = soup.find_all('table', {'class':'raceFieldTable'})
Expect the CSV to look like