AttributeError in web scraping in Python

I've written some code to scrape a website, https://books.toscrape.com/catalogue/page-1.html, but I'm getting an error:
'NoneType' object has no attribute 'text'
I haven't been able to find a solution, so how can I fix this error?
import requests
from bs4 import BeautifulSoup
import pandas as pd

all_books=[]
url='https://books.toscrape.com/catalogue/page-1.html'
headers=('https://developers.whatismybrowser.com/useragents/parse/22526098chrome-windows-blink')

def get_page(url):
    page=requests.get(url,headers)
    status=page.status_code
    soup=BeautifulSoup(page.text,'html.parser')
    return [soup,status]

#get all books links
def get_links(soup):
    links=[]
    listings=soup.find_all(class_='product_pod')
    for listing in listings:
        bk_link=listing.find("h3").a.get("href")
        base_url='https://books.toscrape.com/catalogue/page-1.html'
        cmplt_link=base_url+bk_link
        links.append(cmplt_link)
    return links

#extraxt info from each link
def extract_info(links):
    for link in links:
        r=requests.get(link).text
        book_soup=BeautifulSoup(r,'html.parser')
        name=book_soup.find(class_='col-sm-6 product_main').text.strip()
        price=book_soup.find(class_='col-sm-6 product_main').text.strip()
        desc=book_soup.find(class_='sub-header').text.strip()
        cat=book_soup.find('"../category/books/poetry_23/index.html">Poetry').text.strip()
        book={'name':name,'price':price,'desc':desc,'cat':cat}
        all_books.append(book)

pg=48
while True:
    url=f'https://books.toscrape.com/catalogue/page-{pg}.html'
    soup_status=get_page(url)
    if soup_status[1]==200:
        print(f"scrapping page{pg}")
        extract_info(get_links(soup_status[0]))
        pg+=1
    else:
        print("The End")
        break

df=pd.DataFrame(all_books)
print(df)

Note: First of all, always take a look at your soup - therein lies the truth. The contents can differ slightly to extremely from what you see in the dev tools.
What happens?
There are a few issues you should keep in mind:
base_url='https://books.toscrape.com/catalogue/page-1.html' will lead to 404 errors and is the first cause of your "'NoneType' object has no attribute 'text'"
Trying to find the category like this - cat=book_soup.find('"../category/books/poetry_23/index.html">Poetry').text.strip() - won't work either and leads to the same error
There are some more selections that will not give the expected result; take a look at my example, where I edited them to give you a clue how to reach the goal.
How to fix?
Change base_url='https://books.toscrape.com/catalogue/page-1.html' to base_url='https://books.toscrape.com/catalogue/'
Select the category more specifically - it is the last <a> in the breadcrumb:
cat=book_soup.select('.breadcrumb a')[-1].text.strip()
Example
import requests
from bs4 import BeautifulSoup
import pandas as pd

all_books=[]
url='https://books.toscrape.com/catalogue/page-1.html'
headers=('https://developers.whatismybrowser.com/useragents/parse/22526098chrome-windows-blink')

def get_page(url):
    page=requests.get(url,headers)
    status=page.status_code
    soup=BeautifulSoup(page.text,'html.parser')
    return [soup,status]

#get all books links
def get_links(soup):
    links=[]
    listings=soup.find_all(class_='product_pod')
    for listing in listings:
        bk_link=listing.find("h3").a.get("href")
        base_url='https://books.toscrape.com/catalogue/'
        cmplt_link=base_url+bk_link
        links.append(cmplt_link)
    return links

#extract info from each link
def extract_info(links):
    for link in links:
        r=requests.get(link).text
        book_soup=BeautifulSoup(r,'html.parser')
        name = name.text.strip() if (name := book_soup.h1) else None
        price = price.text.strip() if (price := book_soup.select_one('h1 + p')) else None
        desc = desc.text.strip() if (desc := book_soup.select_one('#product_description + p')) else None
        cat = cat.text.strip() if (cat := book_soup.select('.breadcrumb a')[-1]) else None
        book={'name':name,'price':price,'desc':desc,'cat':cat}
        all_books.append(book)

pg=48
while True:
    url=f'https://books.toscrape.com/catalogue/page-{pg}.html'
    soup_status=get_page(url)
    if soup_status[1]==200:
        print(f"scraping page {pg}")
        extract_info(get_links(soup_status[0]))
        pg+=1
    else:
        print("The End")
        break

all_books

Use the function below when you need to grab the text of an element.
It will protect you from None elements:

def get_text(book_soup, clazz):
    ele = book_soup.find(class_=clazz)
    return ele.text.strip() if ele is not None else ''

Example. Instead of
name=book_soup.find(class_='col-sm-6 product_main').text.strip()
do
name=get_text(book_soup,'col-sm-6 product_main')
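For instance, a rough sketch (mine, not part of the original answer) of the question's extract_info loop rewritten with this helper, reusing only the class names that already appear above:

def extract_info(links):
    for link in links:
        r = requests.get(link).text
        book_soup = BeautifulSoup(r, 'html.parser')
        # Each get_text call returns '' instead of raising when the element is missing
        name = get_text(book_soup, 'col-sm-6 product_main')
        desc = get_text(book_soup, 'sub-header')
        all_books.append({'name': name, 'desc': desc})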

Related

NoneType error Problem in BeautifulSoup in python

I'm a beginner at programming, and I have a problem with the find method in BeautifulSoup when I use it for web scraping. I have this code:
from os import execle, link, unlink, write
from typing import Text
import requests
from bs4 import BeautifulSoup
import csv
from itertools import zip_longest

job_titleL =[]
company_nameL=[]
location_nameL=[]
experience_inL=[]
links=[]
salary=[]
job_requirementsL=[]
date=[]

result= requests.get(f"https://wuzzuf.net/search/jobs/?a=%7B%7D&q=python&start=1")
source = result.content
soup= BeautifulSoup(source , "lxml")

job_titles = soup.find_all("h2",{"class":"css-m604qf"} )
companies_names = soup.find_all("a",{"class":"css-17s97q8"})
locations_names = soup.find_all("span",{"class":"css-5wys0k"})
experience_in = soup.find_all("a", {"class":"css-5x9pm1"})
posted_new = soup.find_all("div",{"class":"css-4c4ojb"})
posted_old = soup.find_all("div",{"class":"css-do6t5g"})
posted = [*posted_new,*posted_old]

for L in range(len(job_titles)):
    job_titleL.append(job_titles[L].text)
    links.append(job_titles[L].find('a').attrs['href'])
    company_nameL.append(companies_names[L].text)
    location_nameL.append(locations_names[L].text)
    experience_inL.append(experience_in[L].text)
    date_text=posted[L].text.replace("-","").strip()
    date.append(posted[L].text)

for link in links:
    result= requests.get(link)
    source= result.content
    soup=BeautifulSoup(source,"lxml")
    requirements=soup.find("div",{"class":"css-1t5f0fr"}).ul
    requirements1=soup.find("div",{"class":"css-1t5f0fr"}).p
    respon_text=""
    if requirements:
        for li in requirements.find_all("li"):
            print(li)
    if requirements1:
        for br in requirements1.find_all("br"):
            print(br)
            respon_text +=li.text + "|"
    job_requirementsL.append(respon_text)

file_list=[job_titleL,company_nameL,date,location_nameL,experience_inL,links,job_requirementsL]
exported=zip_longest(*file_list)
with open('newspeard2.csv',"w") as spreadsheet:
    wr=csv.writer(spreadsheet)
    wr.writerow(["job title", "company name","date", "location", "experience in","links","job requirements"])
    wr.writerows(exported)
Note: I'm not very good at English :(
When I use the find method to get the job requirements from each job on the website (Wuzzuf) and loop through each text in the job requirements, it returns an error saying "'NoneType' object has no attribute 'find_all'". After searching for why this happens, and after inspecting each job page, I found that some job pages use br, p and strong tags in the job requirements. I didn't know what to do, so I used an if statement to test it; it returns the tags, but the br tag is empty with no text. So please, can you see where the problem is and answer me? Thanks.
The webpage:
https://wuzzuf.net/search/jobs/?a=hpb&q=python&start=1
A job that used p and br tags:
https://wuzzuf.net/jobs/p/T9WuTpM3Mveq-Senior-Data-Scientist-Evolvice-GmbH-Cairo-Egypt?o=28&l=sp&t=sj&a=python|search-v3|hpb
Sorry, I didn't understand the problem with p sooner. Here is the loop rewritten to handle it:
for link in links:
    result= requests.get(link)
    source= result.content
    soup=BeautifulSoup(source,"lxml")
    requirements_div=soup.find("div",{"class":"css-1t5f0fr"})
    respon_text=[]
    for child in requirements_div.children:
        if child.name=='ul':
            for li in child.find_all("li"):
                respon_text.append(li.text)
        elif child.name=='p':
            for x in child.contents:
                if x.name == 'br':
                    pass
                elif x.name == 'strong':
                    respon_text.append(x.text)
                else:
                    respon_text.append(x)
    job_requirementsL.append('|'.join(respon_text))
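If some job pages do not contain the css-1t5f0fr div at all, requirements_div itself would be None and the same error would come back on the .children line. A minimal guard of my own (not part of the answer above), placed right after the find call inside the for loop:

    requirements_div = soup.find("div", {"class": "css-1t5f0fr"})
    if requirements_div is None:
        # No requirements section on this page - record an empty entry and move on
        job_requirementsL.append('')
        continue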

How to get all emails from a page individually

I am trying to get all emails from a specific page and separate them into individual variables or, even better, a dictionary. This is the code:
import requests
import re
import json
from bs4 import BeautifulSoup

page = "http://www.example.net"
info = requests.get(page)
if info.status_code == 200:
    print("Page accessed")
else:
    print("Error accessing page")

code = info.content
soup = BeautifulSoup(code, 'lxml')
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
print(allEmails)
sep = ","
allEmailsStr = str(allEmails)
print(type(allEmails))
print(type(allEmailsStr))
j = allEmailsStr.split(sep, 1)[0]
print(j)
Excuse the poor variable names; I put this together quickly so it would run by itself. The output from the example website would be something like
[k, kolyma, location, balkans]
So if I ran the program it would return only
[k
But if I wanted it to return every email on there individually, how would I do that?
To get just the email str you can try:
emails = []
for email_link in allEmails:
    emails.append(email_link.get("href").replace('mailto:', ''))
print(emails)
Based on your expected output, you can use the unwrap function of BeautifulSoup
allEmails = soup.find_all("a", href=re.compile(r"^mailto:"))
for Email in allEmails:
    print(Email.unwrap())  # This will print the whole element along with the tag
    # k
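Since the question also mentions wanting a dictionary, here is a small sketch of my own (not part of the answers above) that maps each link's visible text to its address - the choice of key is an assumption:

emails = {}
for email_link in allEmails:
    # Strip the mailto: prefix and key the address by the link's visible text
    address = email_link.get("href").replace("mailto:", "")
    emails[email_link.text.strip()] = address
print(emails)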

AttributeError: 'NoneType' object has no attribute 'tbody' - Spyder 3.3.1 / beautifulsoup4 / python 3.6

Hey, this is my setup: Spyder 3.3.1 / beautifulsoup4 / python 3.6
The code below is from an article on Medium (here) about web scraping with Python and BeautifulSoup. It was supposed to be a quick read, but now, TWO days later, I still can't get the code to run in Spyder and keep getting:
File "/Users/xxxxxxx/Documents/testdir/swiftScrape.py", line 9, in table_to_df
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
AttributeError: 'NoneType' object has no attribute 'tbody'
Not sure what is going wrong; it seems to be an implementation error. Can anyone assist in shedding some light on this issue?
Thanks in advance.
import os
import bs4
import requests
import pandas as pd

PATH = os.path.join("C:\\","Users","xxxxx","Documents","tesdir")

def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])

def next_page(soup):
    return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')

res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id':'tableID'})
    res = res.append(table_to_df(table))
    res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=None, sep=';', encoding='iso-8859-1')
    url = next_page(soup)
    counter += 1
Like a lot of example code found on the web, this code is not production-grade code - it blindly assumes that HTTP requests always succeed and return the expected content. The truth is that it's quite often not the case (network errors, proxies or firewalls that block you, the site being down - temporarily or permanently, updates to the site that changed the urls and/or the page's markup, etc.).
Your problem manifests itself here:
def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
and comes from table actually being None, which means that here in the loop:
table = soup.find(name='table', attrs={'id':'tableID'})
there was no "table" tag with id "tableID" found in the html document. You can check this by printing the actual html content:
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id':'tableID'})
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format(page.content))
        continue
    # etc
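A minimal defensive sketch of my own (not the article's code): have table_to_df return an empty DataFrame when the table or its tbody is missing, so the rest of the loop can keep going or decide what to do:

def table_to_df(table):
    # Guard against a missing table or a table without a tbody
    if table is None or table.tbody is None:
        return pd.DataFrame()
    return pd.DataFrame([[td.text for td in row.findAll('td')]
                         for row in table.tbody.findAll('tr')])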
Thanks @bruno desthuilliers for your pointers. Much appreciated.
This is the rewritten code that worked for me, using Selenium and webdriver rather than requests:
import os
import bs4
import pandas as pd
from selenium import webdriver

PATH = os.path.join('/','Users','benmorris','documents','testdir')

def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.find_all('td')] for row in soup.find_all('tr')])

def next_page(soup):
    return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')

res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
driver = webdriver.Chrome()
driver.get(url)
while True:
    print(counter)
    page = driver.get(url)
    soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    table = driver.find_element_by_xpath('//*[@id="tableID"]')
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format(page.content))
        continue
    res = res.append(table_to_df(table))
    res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=False, sep=',', encoding='iso-8859-1')
    url = next_page(soup)
    counter += 1
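Note that table here is a Selenium WebElement, while table_to_df was written for BeautifulSoup objects. A small sketch of mine (not the poster's code) that keeps the lookup in BeautifulSoup inside the while loop, assuming the table id really is tableID as in the original code:

    soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    table = soup.find('table', attrs={'id': 'tableID'})
    if table is not None:
        res = res.append(table_to_df(table))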

Web crawler does not open all links in a page

I'm trying to build a web crawler using BeautifulSoup and urllib. The crawler is working, but it does not open all the pages on a site. It opens the first link and goes to that link, opens the first link of that page, and so on.
Here's my code:
from bs4 import BeautifulSoup
from urllib.request import urlopen
from urllib.parse import urljoin
import json, sys

sys.setrecursionlimit(10000)

url = input('enter url ')
d = {}
d_2 = {}
l = []
url_base = url
count = 0

def f(url):
    global count
    global url_base
    if count <= 100:
        print("count: " + str(count))
        print('now looking into: '+url+'\n')
        count += 1
        l.append(url)
        html = urlopen(url).read()
        soup = BeautifulSoup(html, "html.parser")
        d[count] = soup
        tags = soup('a')
        for tag in tags:
            meow = tag.get('href',None)
            if (urljoin(url, meow) in l):
                print("Skipping this one: " + urljoin(url,meow))
            elif "mailto" in urljoin(url,meow):
                print("Skipping this one with a mailer")
            elif meow == None:
                print("skipping 'None'")
            elif meow.startswith('http') == False:
                f(urljoin(url, meow))
            else:
                f(meow)
    else:
        return

f(url)
print('\n\n\n\n\n')
print('Scrapping Completed')
print('\n\n\n\n\n')
The reason you're seeing this behavior is how the code recursively calls your function. As soon as the code finds a valid link, the function f gets called again, preventing the rest of the for loop from running until it returns.
What you're doing is a depth-first search, but the internet is very deep. You want to do a breadth-first search instead.
Probably the easiest way to modify your code to do that is to keep a global list of links to follow. Have the for loop append all the scraped links to the end of this list, and then, outside of the for loop, remove the first element of the list and follow that link - roughly as in the sketch below.
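A rough sketch of that breadth-first variant (my code, not the answer's), reusing the names and imports from the question and replacing the recursion with a queue:

to_visit = [url]                 # links still to open, oldest first
visited = []
count = 0

while to_visit and count <= 100:
    current = to_visit.pop(0)    # breadth first: take the oldest link
    if current in visited:
        continue
    visited.append(current)
    count += 1
    print('now looking into: ' + current)
    soup = BeautifulSoup(urlopen(current).read(), "html.parser")
    for tag in soup('a'):
        meow = tag.get('href', None)
        if meow is None or 'mailto' in meow:
            continue
        full = urljoin(current, meow)
        if full not in visited and full not in to_visit:
            to_visit.append(full)  # enqueue instead of recursing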
You may have to change your logic slightly for your max count.
If count reaches 100, no further links will be opened. Therefore I think you should decrease count by one after leaving the for loop. If you do this, count would be something like the current link depth (and 100 would be the maximum link depth).
If the variable count should refer to the number of opened links, then you might want to control the link depth in another way.

Recursive function gives no output

I'm scraping all the URLs of my domain with a recursive function, but it outputs nothing, without any error.
#usr/bin/python
from bs4 import BeautifulSoup
import requests
import tldextract

def scrape(url):
    for links in url:
        main_domain = tldextract.extract(links)
        r = requests.get(links)
        data = r.text
        soup = BeautifulSoup(data)
        for href in soup.find_all('a'):
            href = href.get('href')
            if not href:
                continue
            link_domain = tldextract.extract(href)
            if link_domain.domain == main_domain.domain :
                problem.append(href)
            elif not href == '#' and link_domain.tld == '':
                new = 'http://www.'+ main_domain.domain + '.' + main_domain.tld + '/' + href
                problem.append(new)
    return len(problem)
    return scrape(problem)
problem = ["http://xyzdomain.com"]
print(scrape(problem))
When I create a new list, it works, but I don't want to make a list every time for every loop.
You need to structure your code so that it meets the pattern for recursion, as your current code doesn't. You also should not give variables the same name as libraries, e.g. href = href.get('href'), because this will usually stop the library working as it becomes the variable. Your code as it currently stands will only ever return the len(), because that return is unconditionally reached before return scrape(problem):
def Recursive(Factorable_problem):
    if Factorable_problem is Simplest_Case:
        return AnswerToSimplestCase
    else:
        return Rule_For_Generating_From_Simpler_Case(Recursive(Simpler_Case))
for example:
def Factorial(n):
    """ Recursively Generate Factorials """
    if n < 2:
        return 1
    else:
        return n * Factorial(n-1)
Hello, I've made a non-recursive version of this that appears to get all the links on the same domain.
I tested the code below using the problem list included in the code. Once I'd solved the problems with the recursive version, the next problem was hitting the recursion depth limit, so I rewrote it to run in an iterative fashion. The code and result are below:
from bs4 import BeautifulSoup
import requests
import tldextract

def print_domain_info(d):
    print "Main Domain:{0} \nSub Domain:{1} \nSuffix:{2}".format(d.domain,d.subdomain,d.suffix)

SEARCHED_URLS = []
problem = [ "http://Noelkd.neocities.org/", "http://youpi.neocities.org/"]

while problem:
    # Get a link from the stack of links
    link = problem.pop()
    # Check we haven't been to this address before
    if link in SEARCHED_URLS:
        continue
    # We don't want to come back here again after this point
    SEARCHED_URLS.append(link)
    # Try and get the website
    try:
        req = requests.get(link)
    except:
        # If its not working i don't care for it
        print "borked website found: {0}".format(link)
        continue
    # Now we get to this point worth printing something
    print "Trying to parse:{0}".format(link)
    print "Status Code:{0} Thats: {1}".format(req.status_code, "A-OK" if req.status_code == 200 else "SOMTHINGS UP" )
    # Get the domain info
    dInfo = tldextract.extract(link)
    print_domain_info(dInfo)
    # I like utf-8
    data = req.text.encode("utf-8")
    print "Lenght Of Data Retrived:{0}".format(len(data)) # More info
    soup = BeautifulSoup(data) # This was here before so i left it.
    print "Found {0} link{1}".format(len(soup.find_all('a')),"s" if len(soup.find_all('a')) > 1 else "")
    FOUND_THIS_ITERATION = [] # Getting the same links over and over was boring
    found_links = [x for x in soup.find_all('a') if x.get('href') not in SEARCHED_URLS] # Find me all the links i don't got
    for href in found_links:
        href = href.get('href') # You wrote this seems to work well
        if not href:
            continue
        link_domain = tldextract.extract(href)
        if link_domain.domain == dInfo.domain: # JUST FINDING STUFF ON SAME DOMAIN RIGHT?!
            if href not in FOUND_THIS_ITERATION: # I'ma check you out next time
                print "Check out this link: {0}".format(href)
                print_domain_info(link_domain)
                FOUND_THIS_ITERATION.append(href)
                problem.append(href)
            else: # I got you already
                print "DUPE LINK!"
        else:
            print "Not on same domain moving on"
    # Count down
    print "We have {0} more sites to search".format(len(problem))
    if problem:
        continue
    else:
        print "Its been fun"

print "Lets see the URLS we've visited:"
for url in SEARCHED_URLS:
    print url
Which prints, after a lot of other logging, loads of neocities websites!
What's happening is that the script pops a value off the list of websites yet to visit, then gets all the links on that page which are on the same domain. If those links point to pages we haven't visited, we add them to the list of links to be visited. After that we pop the next page and do the same thing again until there are no pages left to visit.
I think this is what you're looking for. Get back to us in the comments if this doesn't work the way you want, or if anyone can improve it, please leave a comment.
