Page loop not working on python webscrape - python

I'm quite new to python and have written a script using beautifulsoup to parse a website table. I've tried everything but can't get the loop to cycle through pages. It currently just repeats the data on the first page 8 times (number of pages).
Can anyone please help?
Code:
import requests
from bs4 import BeautifulSoup
first_year = 2020
last_year = 2020
for i in range(last_year-first_year+1):
year = str(first_year + i)
print("Running for year:", year)
text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID="+year).text
soup = BeautifulSoup(text, "html.parser")
options = soup.findAll("option")
opts = []
for option in options:
if not option['value'].startswith("20") and not option['value'].startswith("19") and option["value"]:
opts.append({option["value"]: option.contents[0]})
for opt in opts:
for key, value in opt.items():
print("Doing option:", value)
text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year + "&Round=" + key).text
pages_soup = BeautifulSoup(text, "html.parser")
p = pages_soup.findAll("a")
pages = 8
if "&Page=" in str(p[-2]):
pages = int(p[-2].contents[0])
for j in range(pages):
print("Page {}/{}".format(str(j+1), str(pages)))
parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
p_soup = BeautifulSoup(text, "html.parser")
tbody = pages_soup.findAll("tbody")
tbody_soup = BeautifulSoup(str(tbody), "html.parser")
tr = tbody_soup.findAll("tr")
for t in tr:
t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
t = t[4:len(t)-5].split('</td><td>')
t.append(str(j+1))
t.append(str(value))
t.append(str(year))
open("output.csv", "a").write("\n" + ";".join(t))
Thankyou.

Try this..
for j in range(pages):
print("Page {}/{}".format(str(j+1), str(pages)))
parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
p_soup = BeautifulSoup(parse, "html.parser")
tbody = p_soup.findAll("tbody")
tbody_soup = BeautifulSoup(str(tbody), "html.parser")
tr = tbody_soup.findAll("tr")
for t in tr:
t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
t = t[4:len(t)-5].split('</td><td>')
t.append(str(j+1))
t.append(str(value))
t.append(str(year))
open("output.csv", "a").write("\n" + ";".join(t))

Related

Getting a not subscriptable when running a web scraping script

I am practicing web scraping and am using this code. I am trying the for loop.
import requests
from bs4 import BeautifulSoup
name=[]
link=[]
address=[]
for i in range (1,11):
i=str(i)
url = "https://forum.iktva.sa/exhibitors-list?&page="+i+"&searchgroup=37D5A2A4-exhibitors"
soup = BeautifulSoup(requests.get(url).content, "html.parser")
for a in soup.select(".m-exhibitors-list__items__item__header__title__link"):
company_url = "https://forum.iktva.sa/" + a["href"].split("'")[1]
soup2 = BeautifulSoup(requests.get(company_url).content, "html.parser")
n=soup2.select_one(".m-exhibitor-entry__item__header__title").text
l=soup2.select_one("h4+a")["href"]
a=soup2.select_one(".m-exhibitor-entry__item__body__contacts__address").text
name.append(n)
link.append(l)
address.append(a)
When I am running the program I am getting this error:
l=soup2.select_one("h4+a")["href"]
TypeError: 'NoneType' object is not subscriptable
If i am not sure how to solve the problem.
You just need to raplace, follwing code to Handle None
l = soup2.select_one("h4+a")
if l:
l = l["href"]
else:
l = "Website not available"
As you can see, Because website is not available for:
https://forum.iktva.sa/exhibitors/sanad
OR you can handle all error like:
import requests
from bs4 import BeautifulSoup
def get_object(obj, attr=None):
try:
if attr:
return obj[attr]
else:
return obj.text
except:
return "Not available"
name = []
link = []
address = []
for i in range(1, 11):
i = str(i)
url = f"https://forum.iktva.sa/exhibitors-list?&page={i}&searchgroup=37D5A2A4-exhibitors"
soup = BeautifulSoup(requests.get(url).text, features="lxml")
for a in soup.select(".m-exhibitors-list__items__item__header__title__link"):
company_url = "https://forum.iktva.sa/" + a["href"].split("'")[1]
soup2 = BeautifulSoup(requests.get(company_url).content, "html.parser")
n = soup2.select_one(".m-exhibitor-entry__item__header__title").text
n = get_object(n)
l = soup2.select_one("h4+a")
l = get_object(l, 'href')
a = soup2.select_one(".m-exhibitor-entry__item__body__contacts__address")
a = get_object(a)
name.append(n)
link.append(l)
address.append(a)

Using an if loop to exclude entries containing a substring

I'm trying to scrape the results & statistics from the last 4 seasons of snooker from https://cuetracker.net.
I succeeded (somewhat) in scraping most of the data however I neglected to recognise that there are walkovers included. These walkovers contain only nationality,player name and score data and no stats and therefore as I have scraped my data in lists they do not correctly align when I convert them into a DataFrame.
I am trying to use a if in statement in my loop in order to try and skip these matches that are walkovers and therefore realign the correct stats with the correct match. I'm trying to use the word 'Walkover' to stop the item being appended to my list.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd
def wait_for_page_load():
timer = 15
start_time = time.time()
page_state = None
while page_state != 'complete':
time.sleep(0.5)
page_state = browser.execute_script('return document.readyState;')
if time.time() - start_time > timer:
raise Exception('Timeout :(')
chrome_path = r"C:\Users\George\Desktop\chromedriver.exe"
browser = webdriver.Chrome(chrome_path)
page_source = browser.page_source
browser.get("https://cuetracker.net/seasons")
links = browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs=[]
for link in links:
hrefs.append(link.get_attribute("href"))
hrefs = hrefs[1:2]
hrefs2 = []
for href in hrefs:
browser.get(href)
wait_for_page_load()
links2 = browser.find_elements_by_xpath('.//tr/td[2]/a')
for link in links2:
hrefs2.append((link.get_attribute("href")))
player_1_nationality = []
player_1_name = []
player_1_score = []
player_2_score = []
player_2_nationality = []
player_2_name = []
date_played = []
player_1_points_scored = []
player_2_points_scored = []
player_1_fifty_plus_breaks = []
player_2_fifty_plus_breaks = []
match_progress = []
player_1_points_per_frame = []
player_2_points_per_frame = []
for href in hrefs2:
browser.get(href)
wait_for_page_load()
list_1_nationality= browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
for lis in list_1_nationality:
player_1_nationality.append(lis.get_attribute("alt"))
list_1_player = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/a')
for li in list_1_player:
player_1_name.append(li.get_attribute('text'))
list_2_nationality = browser.find_elements_by_xpath('.//div/div[2]/div[3]/img')
for nationality_2 in list_2_nationality:
player_2_nationality.append(nationality_2.get_attribute("alt"))
list_2_name = browser.find_elements_by_xpath('.//div/div[2]/div[3]/a')
for name_2 in list_2_name:
player_2_name.append(name_2.get_attribute('text'))
list_1_score = browser.find_elements_by_xpath('.//div/div[2]/div[2]/span[1]/b')
for score in list_1_score:
player_1_score.append(score.get_attribute('innerText'))
list_2_score = browser.find_elements_by_xpath('.//div/div[2]/div[2]/span[3]')
for score_2 in list_2_score:
player_2_score.append(score_2.get_attribute('innerText'))
#list_date_played = browser.find_elements_by_xpath('.//div[4]/div[2]/div/div')
#for date in list_date_played:
#date_played.append(date.get_attribute('innerText'))
page_source = browser.page_source
soup = BeautifulSoup(page_source, 'lxml')
points_scored_elem = soup.find_all('div', text='Points Scored')
for elem in points_scored_elem:
player_1_points_scored.append(elem.find_next('div').find_next('div').find_next('div').get_text())
points_scored_2_elem = soup.find_all('div', text='Points Scored')
for elem in points_scored_2_elem:
player_2_points_scored.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())
fifty_plus_breaks_elem = soup.find_all('div', text='50+ Breaks')
for elem in fifty_plus_breaks_elem:
player_1_fifty_plus_breaks.append(elem.find_next('div').find_next('div').find_next('div').get_text())
fifty_plus_breaks_2_elem = soup.find_all('div', text='50+ Breaks')
for elem in fifty_plus_breaks_2_elem:
player_2_fifty_plus_breaks.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())
match_progress_elem = soup.find_all('div', text='Match progress')
for elem in match_progress_elem:
match_progress.append(elem.find_next('div').find_next('div').find_next('div').get_text(strip=True))
points_per_frame_elem = soup.find_all('div', text='points/frame')
for elem in points_scored_elem:
player_1_points_per_frame.append(elem.find_next('div').find_next('div').find_next('div').get_text())
points_per_frame_2_elem = soup.find_all('div', text='Avg. points/frame')
for elem in points_scored_elem:
player_2_points_per_frame.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())
points_scored_2_elem = soup.find_all('div', text='Avg. points/frame')
for elem in points_scored_2_elem:
player_2_points_per_frame.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())
list_date_played = soup.find_all('div', text='Played on')
for date in list_date_played:
date_played.append(date.find_next('div').find_next('div').find_next('div').get_text(strip=True))
The above code is the bulk of my scraping code. Below is what I'm trying to test in order to skip the walkovers. I'm trying to use the parent of the img as it includes the 'Walkover' string within it's innerText.
for href in hrefs2:
browser.get(href)
wait_for_page_load()
page_source = browser.page_source
list_3_nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
for lis in list_1_nationality:
check = lis.find_element_by_xpath(("./parent::b"))
check = check.get_attribute('innerText')
if 'Walkover' in check:
continue
else:
player_1_nationality.append(lis.get_attribute('alt'))
For some reason this doesn't seem to be working?

How can I crawl all the page?

I am trying to crawl sites's text. But It's only crawling 12 articles.
I don't know why does it do like that. and I wondering If I wanna crawl other pages, What should I do?
import requests
from bs4 import BeautifulSoup
x = int(input("start page:"))
while x < int(input("end page:")):
x = x + 1
url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
result = requests.get(url)
bs_obj = BeautifulSoup(result.content, "html.parser")
content = bs_obj.find("div", {"class": "msp-three-col"})
read_more = content.findAll("div", {"class": "read-more"})
for item in read_more:
atag = item.find('a')
link = "https://www.mmtimes.com" + atag["href"]
linkResult = requests.get(link)
subpage = BeautifulSoup(linkResult.content, "html.parser")
fnresult = subpage.find("div", {"class": "field-item even"})
print(fnresult.text)
print("Total "+str(len(read_more))+" articles"))
Check out the below code, I have made some changes. this will result the required output.
import requests
from bs4 import BeautifulSoup
x = int(input("start page:"))
y = input("end page:")
article_count = 0
while x <= int(y):
url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
result = requests.get(url)
bs_obj = BeautifulSoup(result.content, "html.parser")
content = bs_obj.find("div", {"class": "msp-three-col"})
read_more = content.findAll("div", {"class": "read-more"})
for item in read_more:
atag = item.find('a')
link = "https://www.mmtimes.com" + atag["href"]
linkResult = requests.get(link)
subpage = BeautifulSoup(linkResult.content, "html.parser")
fnresult = subpage.find("div", {"class": "field-item even"})
print(fnresult.text)
article_count += len(read_more)
print("Total "+str(article_count)+" articles")
x += 1

How to update scraping link in for loop

I developed this program to scrape newegg for ps4 prices. However I want to scrape multiple pages. Here is what I have but once it scrapes the first page the program stops. Basically I am trying to change the link so 'pages-1' changes to 2,3,4 etc. Is there a better way to do this?
from bs4 import BeautifulSoup
import requests
import csv
page_num = 1
prod_num = 0
source = requests.get('https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-' + str(page_num) + '?PageSize=36&order=BESTMATCH').text
soup = BeautifulSoup(source, 'lxml')
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
for info in soup.find_all('div', class_='item-container'):
prod = info.find('a', class_='item-title').text.strip()
price = info.find('li', class_='price-current').text.strip().splitlines()[1].replace(u'\xa0', '')
if u'$' not in price:
price = info.find('li', class_='price-current').text.strip().splitlines()[0].replace(u'\xa0', '')
ship = info.find('li', class_='price-ship').text.strip()
print(prod)
print(price)
print(ship)
csv_writer.writerow([prod, price, ship])
prod_num += 1
if prod_num > 35: #there is about 35 items per newegg page
page_num += 1
# print(price.splitlines()[1])
print('-----------')
csv_file.close()
i found the page limit num here
and i think you can get the page limit by xpath or other ways:
# xpath syntax may like this
# //span[#class='list-tool-pagination-text']
hope it's useful for you
If you noticed, Next "button" tag of last page has attribute "disabled", So [tag_name].has_attr('disabled') return True . Using this you can manage pagination.
import requests
from bs4 import BeautifulSoup
import csv
csv_file = open('newegg_scrape.csv', 'w')
csv_writer = csv.writer(csv_file)
csv_writer.writerow(['Product', 'Price', 'Shipping_info'])
URL_PART1 = "https://www.newegg.com/PS4-Systems/SubCategory/ID-3102/Page-"
URL_PART2 = "?PageSize=36&order=BESTMATCH"
PAGE_NO = 1
url = URL_PART1 + str(PAGE_NO) + URL_PART2
while len(url):
PAGE_NO+=1
resp = requests.get(url)
soup = BeautifulSoup(resp.text, 'html.parser')
all_divs = soup.find_all('div', attrs={'class':'item-info'})
for item in all_divs:
prod = ""
price = ""
ship = ""
# get product name
prod = item.find('a', attrs={'class':'item-title'})
if prod:
prod = prod.text.strip()
# get price
price_part = item.find('li', attrs={'class':'price-current'})
if price_part:
price_part1 = price_part.strong
if price_part1:
price_part1 = price_part1.text.strip()
price_part2 = price_part.sup
if price_part2:
price_part2 = price_part2.text.strip()
if price_part1 and price_part2:
price = price_part1 + price_part2
# get shipping info
ship = item.find('li', attrs={'class':'price-ship'})
if ship:
ship = ship.text.strip()
csv_writer.writerow([prod, price, ship])
# manage pagination
next_button = soup.find('button', attrs={'title': 'Next'})
if not(next_button.has_attr('disabled')):
url = URL_PART1 + str(PAGE_NO) + URL_PART2
else:
url = ""

beautifulSoup does not match chrome inspect while Python webscraping

I am currently trying to webscrape protein sequences off of the ncbi protein database. At this point, the user can search for a protein and I can get the link to the first result that the database spits out. However, when I run this through beautiful soup, the soup does not match the chrome inspect element, nor does it have the sequence at all.
Here is my current code:
import string
import requests
from bs4 import BeautifulSoup
def getSequence():
searchProt = input("Enter a Protein Name!:")
if searchProt != '':
searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
page = requests.get(searchString)
soup = BeautifulSoup(page.text, 'html.parser')
soup = str(soup)
accIndex = soup.find("a")
accessionStart = soup.find('<dd>',accIndex)
accessionEnd = soup.find('</dd>', accessionStart + 4)
accession = soup[accessionStart + 4: accessionEnd]
newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
try:
newPage = requests.get(newSearchString)
#This is where it fails
newSoup = BeautifulSoup(newPage.text, 'html.parser')
aaList = []
spaceCount = newSoup.count("ff_line")
print(spaceCount)
for i in range(spaceCount):
startIndex = newSoup.find("ff_line")
startIndex = newSoup.find(">", startIndex) + 2
nextAA = newSoup[startIndex]
while nextAA in string.ascii_lowercase:
aaList.append(nextAA)
startIndex += 1
nextAA = newSoup[startIndex]
return aaList
except:
print("Please Enter a Valid Protein")
I have been trying to run it with the search 'p53' and have gotten to the link: here
I have looked at a long series of webscraping entries on this website and tried a lot of things including installing selenium and using different parsers. I am still confused about why these don't match. (Sorry if this is a repeat question, I am very new to webscraping and currently have a concussion so I am looking for a bit of individual case feedback)
This code will extract the protein sequence you want using Selenium. I've modified your original code to give you the result you wanted.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests
driver = webdriver.Firefox()
def getSequence():
searchProt = input("Enter a Protein Name!:")
if searchProt != '':
searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
page = requests.get(searchString)
soup = BeautifulSoup(page.text, 'html.parser')
soup = str(soup)
accIndex = soup.find("a")
accessionStart = soup.find('<dd>',accIndex)
accessionEnd = soup.find('</dd>', accessionStart + 4)
accession = soup[accessionStart + 4: accessionEnd]
newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
try:
driver.get(newSearchString)
html = driver.page_source
newSoup = BeautifulSoup(html, "lxml")
ff_tags = newSoup.find_all(class_="ff_line")
aaList = []
for tag in ff_tags:
aaList.append(tag.text.strip().replace(" ",""))
protSeq = "".join(aaList)
return protSeq
except:
print("Please Enter a Valid Protein")
sequence = getSequence()
print(sequence)
Which produces the following output for input of "p53":
meepqsdlsielplsqetfsdlwkllppnnvlstlpssdsieelflsenvtgwledsggalqgvaaaaastaedpvtetpapvasapatpwplsssvpsyktfqgdygfrlgflhsgtaksvtctyspslnklfcqlaktcpvqlwvnstpppgtrvramaiykklqymtevvrrcphherssegdslappqhlirvegnlhaeylddkqtfrhsvvvpyeppevgsdcttihynymcnsscmggmnrrpiltiitledpsgnllgrnsfevricacpgrdrrteeknfqkkgepcpelppksakralptntssspppkkktldgeyftlkirgherfkmfqelnealelkdaqaskgsedngahssylkskkgqsasrlkklmikregpdsd

Categories

Resources