Getting a "'NoneType' object is not subscriptable" error when running a web scraping script - Python

I am practicing web scraping and am using this code, looping over the listing pages with a for loop.
import requests
from bs4 import BeautifulSoup

name = []
link = []
address = []

for i in range(1, 11):
    i = str(i)
    url = "https://forum.iktva.sa/exhibitors-list?&page=" + i + "&searchgroup=37D5A2A4-exhibitors"
    soup = BeautifulSoup(requests.get(url).content, "html.parser")
    for a in soup.select(".m-exhibitors-list__items__item__header__title__link"):
        company_url = "https://forum.iktva.sa/" + a["href"].split("'")[1]
        soup2 = BeautifulSoup(requests.get(company_url).content, "html.parser")
        n = soup2.select_one(".m-exhibitor-entry__item__header__title").text
        l = soup2.select_one("h4+a")["href"]
        a = soup2.select_one(".m-exhibitor-entry__item__body__contacts__address").text
        name.append(n)
        link.append(l)
        address.append(a)
When I run the program I get this error:
l=soup2.select_one("h4+a")["href"]
TypeError: 'NoneType' object is not subscriptable
I am not sure how to solve the problem.

You just need to replace the following code to handle None:
l = soup2.select_one("h4+a")
if l:
    l = l["href"]
else:
    l = "Website not available"
The error occurs because the website link is missing for some exhibitors, for example:
https://forum.iktva.sa/exhibitors/sanad
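For illustration, here is a minimal reproduction of the error (the HTML fragment is made up): select_one returns None when nothing matches, and subscripting None raises exactly this TypeError:

from bs4 import BeautifulSoup

# A fragment with an <h4> but no <a> immediately after it (hypothetical HTML)
soup = BeautifulSoup("<h4>Website</h4>", "html.parser")

tag = soup.select_one("h4+a")  # no match, so select_one returns None
print(tag)                     # None
# tag["href"]                  # TypeError: 'NoneType' object is not subscriptable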
Or you can handle all such errors with a helper like:
import requests
from bs4 import BeautifulSoup


def get_object(obj, attr=None):
    # Return the tag's attribute or text, or a placeholder when the tag is missing
    try:
        if attr:
            return obj[attr]
        else:
            return obj.text
    except (AttributeError, TypeError, KeyError):
        return "Not available"


name = []
link = []
address = []

for i in range(1, 11):
    i = str(i)
    url = f"https://forum.iktva.sa/exhibitors-list?&page={i}&searchgroup=37D5A2A4-exhibitors"
    soup = BeautifulSoup(requests.get(url).text, features="lxml")
    for a in soup.select(".m-exhibitors-list__items__item__header__title__link"):
        company_url = "https://forum.iktva.sa/" + a["href"].split("'")[1]
        soup2 = BeautifulSoup(requests.get(company_url).content, "html.parser")
        # Pass the tags themselves to get_object, not their .text,
        # so a missing tag cannot crash before the helper runs
        n = get_object(soup2.select_one(".m-exhibitor-entry__item__header__title"))
        l = get_object(soup2.select_one("h4+a"), "href")
        a = get_object(soup2.select_one(".m-exhibitor-entry__item__body__contacts__address"))
        name.append(n)
        link.append(l)
        address.append(a)
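If you then want the three parallel lists as rows, for example in a CSV file, here is a minimal sketch using zip and the csv module (my addition, not from the original answer; the file name is made up):

import csv

# name, link and address are the lists filled in the loop above;
# zip pairs them up row by row and stops at the shortest list
with open("exhibitors.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["name", "link", "address"])
    for row in zip(name, link, address):
        writer.writerow(row)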

Related

webscraping bus stops with beautifulsoup

I am trying to web scrape bus stop names for a given line. Here is an example page for line 212: https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212. I want to have as output two lists, one with the bus stop names in one direction and the other with the opposite direction (it's clearly seen on the web page). I managed to get all names in one list with:
import requests
from bs4 import BeautifulSoup


def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    print(soup.prettify())
    all_bus_stops = []
    table = soup.find_all('a')
    for element in table:
        if element.get_text() in all_bus_stops:
            continue
        else:
            all_bus_stops.append(element.get_text())
    return all_bus_stops


print(download_bus_schedule('212'))
I guess the solution would be to somehow divide the soup into two parts.
You can use the bs4.element.Tag.findAll method:
import requests
from bs4 import BeautifulSoup


def download_bus_schedule(bus_number):
    all_bus_stops = []
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html.parser')
    for s in soup.select(".holo-list"):
        bus_stops = []
        for f in s.findAll("li"):
            if f.text not in bus_stops:
                bus_stops.append(f.text)
        all_bus_stops.append(bus_stops)
    return all_bus_stops


print(download_bus_schedule('212'))
Output:
[['Pl.Hallera', 'Pl.Hallera', 'Darwina', 'Namysłowska', 'Rondo Żaba', 'Rogowska', 'Kołowa', 'Dks Targówek', 'Metro Targówek Mieszkaniowy', 'Myszkowska', 'Handlowa', 'Metro Trocka', 'Bieżuńska', 'Jórskiego', 'Łokietka', 'Samarytanka', 'Rolanda', 'Żuromińska', 'Targówek-Ratusz', 'Św.Wincentego', 'Malborska', 'Ch Targówek'],
['Ch Targówek', 'Ch Targówek', 'Malborska', 'Św.Wincentego', 'Targówek-Ratusz', 'Żuromińska', 'Gilarska', 'Rolanda', 'Samarytanka', 'Łokietka', 'Jórskiego', 'Bieżuńska', 'Metro Trocka', 'Metro Trocka', 'Metro Trocka', 'Handlowa', 'Myszkowska', 'Metro Targówek Mieszkaniowy', 'Dks Targówek', 'Kołowa', 'Rogowska', 'Rondo Żaba', '11 Listopada', 'Bródnowska', 'Szymanowskiego', 'Pl.Hallera', 'Pl.Hallera']]
import requests
from bs4 import BeautifulSoup


def download_bus_schedule(bus_number):
    URL = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number
    r = requests.get(URL)
    soup = BeautifulSoup(r.content, 'html5lib')
    bus_stops_1 = []
    bus_stops_2 = []
    directions = soup.find_all("ul", {"class": "holo-list"})
    for stop in directions[0].find_all("a"):
        if stop.text.strip() not in bus_stops_1:  # compare the text, not the tag object
            bus_stops_1.append(stop.text.strip())
    for stop in directions[1].find_all("a"):
        if stop.text.strip() not in bus_stops_2:
            bus_stops_2.append(stop.text.strip())
    all_bus_stops = (bus_stops_1, bus_stops_2)
    return all_bus_stops


schedule = download_bus_schedule('212')  # fetch once instead of twice
print(schedule[0])
print(schedule[1])
I may have misunderstood as I do not know Polish but see if this helps.
from bs4 import BeautifulSoup
import requests

url = 'https://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=212'
resp = requests.get(url)
soup = BeautifulSoup(resp.content, "html.parser")

d = {}
for h2 in soup.select('h2.holo-divider'):
    d[h2.text] = []
    ul = h2.find_next_sibling('ul')  # skip whitespace text nodes between the tags
    for li in ul.select('li'):
        if li.a.text not in d[h2.text]:
            d[h2.text].append(li.a.text)

from pprint import pprint
pprint(d)
As all stops are encapsulated in the next unordered list, you could use the find_next function of bs4, e.g.:
URL = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"
r = requests.get(URL)
soup = BeautifulSoup(r.content,
'html5lib')
directions = ["Ch Targówek","Pl.Hallera"]
result = {}
for direction in directions:
header = soup.find(text=direction)
list = header.find_next("ul")
stops_names = [stop.get_text() for stop in list]
result[direction] = stops_names
return result
Also, you might want to use f-strings to format your strings, as they improve readability and are less error-prone.
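For example (an illustrative comparison, my addition rather than part of the original answer):

bus_number = "212"

# String concatenation needs manual str() conversions and is easy to get wrong
url = "http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l=" + bus_number

# The f-string keeps the template readable and converts values automatically
url = f"http://www.m2.rozkladzik.pl/warszawa/rozklad_jazdy.html?l={bus_number}"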

Page loop not working in Python web scrape

I'm quite new to Python and have written a script using BeautifulSoup to parse a website table. I've tried everything but can't get the loop to cycle through pages. It currently just repeats the data from the first page 8 times (the number of pages).
Can anyone please help?
Code:
import requests
from bs4 import BeautifulSoup

first_year = 2020
last_year = 2020

for i in range(last_year - first_year + 1):
    year = str(first_year + i)
    print("Running for year:", year)
    text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year).text
    soup = BeautifulSoup(text, "html.parser")
    options = soup.findAll("option")
    opts = []
    for option in options:
        if not option['value'].startswith("20") and not option['value'].startswith("19") and option["value"]:
            opts.append({option["value"]: option.contents[0]})
    for opt in opts:
        for key, value in opt.items():
            print("Doing option:", value)
            text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year + "&Round=" + key).text
            pages_soup = BeautifulSoup(text, "html.parser")
            p = pages_soup.findAll("a")
            pages = 8
            if "&Page=" in str(p[-2]):
                pages = int(p[-2].contents[0])
            for j in range(pages):
                print("Page {}/{}".format(str(j+1), str(pages)))
                parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
                p_soup = BeautifulSoup(text, "html.parser")
                tbody = pages_soup.findAll("tbody")
                tbody_soup = BeautifulSoup(str(tbody), "html.parser")
                tr = tbody_soup.findAll("tr")
                for t in tr:
                    t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
                    t = t[4:len(t)-5].split('</td><td>')
                    t.append(str(j+1))
                    t.append(str(value))
                    t.append(str(year))
                    open("output.csv", "a").write("\n" + ";".join(t))
Thank you.
Try this. The bug is that inside the page loop you build the soup from text (the round page you fetched earlier) instead of parse (the page you just requested), and then read the tables from pages_soup instead of p_soup, so every page repeats the first page's data:
for j in range(pages):
    print("Page {}/{}".format(str(j+1), str(pages)))
    parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
    p_soup = BeautifulSoup(parse, "html.parser")  # parse the page you just fetched
    tbody = p_soup.findAll("tbody")               # and read the tables from it
    tbody_soup = BeautifulSoup(str(tbody), "html.parser")
    tr = tbody_soup.findAll("tr")
    for t in tr:
        t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
        t = t[4:len(t)-5].split('</td><td>')
        t.append(str(j+1))
        t.append(str(value))
        t.append(str(year))
        open("output.csv", "a").write("\n" + ";".join(t))

How can I crawl all the pages?

I am trying to crawl a site's text, but it only crawls 12 articles.
I don't know why it does that, and I am wondering: if I want to crawl the other pages, what should I do?
import requests
from bs4 import BeautifulSoup

x = int(input("start page:"))
while x < int(input("end page:")):
    x = x + 1
    url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    read_more = content.findAll("div", {"class": "read-more"})
    for item in read_more:
        atag = item.find('a')
        link = "https://www.mmtimes.com" + atag["href"]
        linkResult = requests.get(link)
        subpage = BeautifulSoup(linkResult.content, "html.parser")
        fnresult = subpage.find("div", {"class": "field-item even"})
        print(fnresult.text)
    print("Total " + str(len(read_more)) + " articles")
Check out the code below; I have made some changes, and this will produce the required output. The key fixes: the end page is read once before the loop (your version called input() in the loop condition, prompting again on every pass), the condition uses <= so the end page is included, and the article count is accumulated across pages.
import requests
from bs4 import BeautifulSoup

x = int(input("start page:"))
y = input("end page:")
article_count = 0

while x <= int(y):
    url = "https://www.mmtimes.com/national-news.html?page=" + str(x)
    result = requests.get(url)
    bs_obj = BeautifulSoup(result.content, "html.parser")
    content = bs_obj.find("div", {"class": "msp-three-col"})
    read_more = content.findAll("div", {"class": "read-more"})
    for item in read_more:
        atag = item.find('a')
        link = "https://www.mmtimes.com" + atag["href"]
        linkResult = requests.get(link)
        subpage = BeautifulSoup(linkResult.content, "html.parser")
        fnresult = subpage.find("div", {"class": "field-item even"})
        print(fnresult.text)
    article_count += len(read_more)
    print("Total " + str(article_count) + " articles")
    x += 1
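To see why the original version stopped early, note that a while condition is re-evaluated on every iteration, so an input() call inside it prompts again each time through the loop. A minimal illustration (my addition, with made-up prompts):

# Prompts for "end page" on EVERY pass through the loop
x = 1
while x < int(input("end page:")):
    print("page", x)
    x += 1

# Reading the bound once before the loop asks only once
x = 1
end = int(input("end page:"))
while x <= end:
    print("page", x)
    x += 1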

BeautifulSoup Python 3 howlongtobeat.com: extracting the name (and other elements)

I am trying to figure out how to extract the name of the game with BeautifulSoup.
I think I am having a problem with the HTML aspect of it.
Here is what I have so far:
from requests import get
url = 'https://howlongtobeat.com/game.php?id=38050'
response = get(url)
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
game_length = html_soup.find_all('div', class_='game_times')
length = (game_length[-1].find_all({'li': ' short time_100 shadow_box'})[-1].contents[3].get_text())
print(length)
game_name = html_soup.find_all('div', class_='profile_header_game')
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
print(game)
I'm getting the length but not the game name. Why?
print(length) prints:
31 Hours
but print(game) gives:
game_name = html_soup.find_all('div', class_='profile_header_game')
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
File "", line 1
game = (game_name[].find({"profile_header shadow_text"})[].contents[].get_text())
^
SyntaxError: invalid syntax
print(game)
Traceback (most recent call last):
File "", line 1, in
NameError: name 'game' is not defined
What am I doing wrong?
It looks like there are a few syntax issues in your code. Here is a corrected version:
from bs4 import BeautifulSoup
import requests

url = 'https://howlongtobeat.com/game.php?id=38050'
response = requests.get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')

game_times_tag = html_soup.find('div', class_='game_times')
game_time_list = []
for li_tag in game_times_tag.find_all('li'):
    title = li_tag.find('h5').text.strip()
    play_time = li_tag.find('div').text.strip()
    game_time_list.append((title, play_time))

for game_time in game_time_list:
    print(game_time)

profile_header_tag = html_soup.find("div", {"class": "profile_header shadow_text"})
game_name = profile_header_tag.text.strip()
print(game_name)
A shorter version:
game_length = html_soup.select('div.game_times li div')[-1].text
game_name = html_soup.select('div.profile_header')[0].text
developer = html_soup.find_all('strong', string='\nDeveloper:\n')[0].next_sibling
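As background (my addition, not from the original answers): .select() takes CSS selectors, while find()/find_all() match by tag name and attributes, so the two styles used above are interchangeable. For example, reusing html_soup from the answer above:

# These two calls find the same tags
headers = html_soup.select('div.profile_header')
headers = html_soup.find_all('div', class_='profile_header')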

BeautifulSoup does not match Chrome inspect while web scraping in Python

I am currently trying to scrape protein sequences from the NCBI protein database. At this point, the user can search for a protein and I can get the link to the first result that the database returns. However, when I run this through BeautifulSoup, the soup does not match the Chrome inspect element, nor does it contain the sequence at all.
Here is my current code:
import string
import requests
from bs4 import BeautifulSoup


def getSequence():
    searchProt = input("Enter a Protein Name!:")
    if searchProt != '':
        searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
        page = requests.get(searchString)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = str(soup)
        accIndex = soup.find("a")
        accessionStart = soup.find('<dd>', accIndex)
        accessionEnd = soup.find('</dd>', accessionStart + 4)
        accession = soup[accessionStart + 4: accessionEnd]
        newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
        try:
            newPage = requests.get(newSearchString)
            # This is where it fails
            newSoup = BeautifulSoup(newPage.text, 'html.parser')
            aaList = []
            spaceCount = newSoup.count("ff_line")
            print(spaceCount)
            for i in range(spaceCount):
                startIndex = newSoup.find("ff_line")
                startIndex = newSoup.find(">", startIndex) + 2
                nextAA = newSoup[startIndex]
                while nextAA in string.ascii_lowercase:
                    aaList.append(nextAA)
                    startIndex += 1
                    nextAA = newSoup[startIndex]
            return aaList
        except:
            print("Please Enter a Valid Protein")
I have been trying to run it with the search 'p53' and have gotten to the link: here
I have looked at a long series of web scraping entries on this site and tried a lot of things, including installing Selenium and using different parsers. I am still confused about why these don't match. (Sorry if this is a repeat question; I am very new to web scraping and currently have a concussion, so I am looking for a bit of individual case feedback.)
The page loads the sequence dynamically with JavaScript, so the static HTML that requests downloads never contains it; that is why your soup does not match what Chrome's inspector shows. This code will extract the protein sequence you want using Selenium, which drives a real browser and sees the rendered page. I've modified your original code to give you the result you wanted.
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

driver = webdriver.Firefox()


def getSequence():
    searchProt = input("Enter a Protein Name!:")
    if searchProt != '':
        searchString = "https://www.ncbi.nlm.nih.gov/protein/?term=" + searchProt
        page = requests.get(searchString)
        soup = BeautifulSoup(page.text, 'html.parser')
        soup = str(soup)
        accIndex = soup.find("a")
        accessionStart = soup.find('<dd>', accIndex)
        accessionEnd = soup.find('</dd>', accessionStart + 4)
        accession = soup[accessionStart + 4: accessionEnd]
        newSearchString = "https://www.ncbi.nlm.nih.gov/protein/" + accession
        try:
            driver.get(newSearchString)
            html = driver.page_source  # the rendered page, sequence included
            newSoup = BeautifulSoup(html, "lxml")
            ff_tags = newSoup.find_all(class_="ff_line")
            aaList = []
            for tag in ff_tags:
                aaList.append(tag.text.strip().replace(" ", ""))
            protSeq = "".join(aaList)
            return protSeq
        except:
            print("Please Enter a Valid Protein")


sequence = getSequence()
print(sequence)
Which produces the following output for an input of "p53":
meepqsdlsielplsqetfsdlwkllppnnvlstlpssdsieelflsenvtgwledsggalqgvaaaaastaedpvtetpapvasapatpwplsssvpsyktfqgdygfrlgflhsgtaksvtctyspslnklfcqlaktcpvqlwvnstpppgtrvramaiykklqymtevvrrcphherssegdslappqhlirvegnlhaeylddkqtfrhsvvvpyeppevgsdcttihynymcnsscmggmnrrpiltiitledpsgnllgrnsfevricacpgrdrrteeknfqkkgepcpelppksakralptntssspppkkktldgeyftlkirgherfkmfqelnealelkdaqaskgsedngahssylkskkgqsasrlkklmikregpdsd
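As an aside (my addition, not part of the original answer): for NCBI specifically you can avoid the browser entirely, because the E-utilities efetch endpoint returns sequences as plain text. A minimal sketch, assuming a valid protein accession (NP_000537 is used here only as an example for p53):

import requests

# Fetch a FASTA record directly from NCBI E-utilities (no JavaScript rendering needed)
accession = "NP_000537"  # example accession; substitute the one extracted above
params = {"db": "protein", "id": accession, "rettype": "fasta", "retmode": "text"}
resp = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi", params=params)

fasta = resp.text
sequence = "".join(fasta.splitlines()[1:])  # drop the ">" header line, join the rest
print(sequence)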
