I want to scrape the <a> tags on a specific URL (https://www.sortlist.fr/pub) and select the specific one where the value "Dupont Lewis" is found. I am using the following code:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.sortlist.fr/pub")
BeautifulSoup.find_all("a")
BeautifulSoup("a")
search = BeautifulSoup.select_one("a[title*=Dupont Lewis]")
if len(search)>0:
    print ('I find')
else:
    print ('None')
But I get the following error: "AttributeError: 'str' object has no attribute 'descendants'".
Can anyone help me please?
The error is that you don't create a soup from the server response:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.sortlist.fr/pub")
soup = BeautifulSoup(r.content, 'html.parser') # <-- create a soup
search = soup.select_one('a[title*="Dupont Lewis"]') # <-- put "Dupont Lewis" inside ""
if search:                                                  # <-- len() isn't necessary, because of .select_one
    print ('I find')
else:
    print ('None')
Prints:
I find
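If it helps, here is a small follow-up sketch (my own addition, assuming the matched <a> tag exists and carries title and href attributes) showing how to read the matched link once select_one finds it:
import requests
from bs4 import BeautifulSoup
r = requests.get("https://www.sortlist.fr/pub")
soup = BeautifulSoup(r.content, 'html.parser')
search = soup.select_one('a[title*="Dupont Lewis"]')
if search:
    # .get() returns None instead of raising if an attribute is missing
    print(search.get('title'))
    print(search.get('href'))
else:
    print('None')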
I'm new to using Python and I'm trying to make a web scraper for an internship.
from typing import Container
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
p1 = ["https://www.libris.ro/search?iv.q={}", "https://carturesti.ro/product/search/{}", "https://www.elefant.ro/search?SearchTerm={}&StockAvailability=true", "https://www.litera.ro/catalogsearch/result/?q{}", "https://www.librariadelfin.ro/?submitted=1&O=search&keywords{}&do_submit=1", "https://bookzone.ro/cautare?term={}", "https://www.librex.ro/search/{}/?q={}"]
#price_min = 1000000
#url_min, price_min
title = "percy jackson"
for x in p1:
    temp = x
    title = title.replace(" ", "+")
    url = temp.format(title)
    if url == "https://www.libris.ro/search?iv.q=" + title :
        **books = bs.find_all("div", class_="product-item-info imgdim-x")**
        for each_book in books:
            book_url = each_book.find("a")["href"]
            price = each_book.find("span", class_="price-wrapper")
            print(book_url)
            print(price)
and I'm getting this error for the line between the two asterisks:
Exception has occurred: AttributeError
'str' object has no attribute 'descendants'
After from bs4 import BeautifulSoup as bs, bs is the class. You need to instantiate that class with data from the website. In the code below, I've added a requests call to get the page and built the BeautifulSoup document from there. You'll find some other errors in your code that need to be sorted out, but it will get you past this problem.
from typing import Container
import requests
from bs4 import BeautifulSoup as bs
from selenium import webdriver
p1 = ["https://www.libris.ro/search?iv.q={}", "https://carturesti.ro/product/search/{}", "https://www.elefant.ro/search?SearchTerm={}&StockAvailability=true", "https://www.litera.ro/catalogsearch/result/?q{}", "https://www.librariadelfin.ro/?submitted=1&O=search&keywords{}&do_submit=1", "https://bookzone.ro/cautare?term={}", "https://www.librex.ro/search/{}/?q={}"]
#price_min = 1000000
#url_min, price_min
title = "percy jackson"
for x in p1:
    temp = x
    title = title.replace(" ", "+")
    url = temp.format(title)
    if url == "https://www.libris.ro/search?iv.q=" + title :
        # THE FIX
        resp = requests.get(url)
        if not 200 <= resp.status_code < 299:
            print("failed", resp.status_code, url)
            continue
        doc = bs(resp.text, "html.parser")
        books = doc.find_all("div", class_="product-item-info imgdim-x")
        for each_book in books:
            book_url = each_book.find("a")["href"]
            price = each_book.find("span", class_="price-wrapper")
            print(book_url)
            print(price)
I'm trying to scrape URL links from Google. Users can input any search and then get the URL links. But the main problem is that this split function doesn't work and I can't fix it, so please help me.
[[Suppose: a user inputs "all useless website"; Google shows results, and the user should get only the URL links.]]
from typing import re
from bs4 import BeautifulSoup
import requests
user_input = input('Enter value for search : ')
print('Please Wait')
page_source = requests.get("https://www.google.com/search?q=" + user_input)
soup = BeautifulSoup(page_source.text, 'html.parser')
print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)
all_links = soup.find_all('a')
for link in all_links:
    link_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
    print(link_google.find["a"])
You're importing re from the wrong place. You need to use it via import re, as follows:
import re
...
link_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
Update to make your code work:
1. import re correctly
2. fix this line from all_links = soup.find_all('a') to all_links = soup.find_all('a', href=True)
3. take the link and clean it up like you did before (re.split() works perfectly, but it returns a list), then add that link to a list (unpacking the list) or print it
Here is the code updated to make it work
# issue 1
import re
from bs4 import BeautifulSoup
import requests
user_input = input('Enter value for search : ')
print('Please Wait')
page_source = requests.get("https://www.google.com/search?q=" + user_input)
soup = BeautifulSoup(page_source.text, 'html.parser')
print(soup.title)
print(soup.title.string)
print(soup.title.parent.name)
# issue 2
all_links = soup.find_all('a', href=True)
for link in all_links:
    link_from_google = re.split(":(?=http)", link["href"].replace("/url?q=", ""))
    # issue 3
    print(link_from_google[0])
>>> {returns all the http links}
A one-liner list comprehension, for fun:
google_links = [re.split(":(?=http)", link["href"].replace("/url?q=", ""))[0] for link in soup.find_all('a', href=True)]
>>> {returns all the http links}
I want to crawl a bank site, but the Google Chrome driver doesn't work.
And I want to print 'null' if the type is 'NoneType'.
I wrote this code in a Jupyter notebook (Anaconda3).
I tried to solve this problem using 'if...else...'.
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium import webdriver
driver = webdriver.Chrome('./chromedriver.exe')
html_dr = driver.execute_script('return document.body.innerHTML') #question
url_base = 'https://www.kebhana.com/cont/mall/mall08/mall0805/index.jsp?_menuNo=62608'
html = urlopen(url_base)
soup = BeautifulSoup(html, "html.parser")
soup
print(soup.find_all('li', 'item type1'))
len(soup.find_all('li', 'item type1'))
from urllib.request import urljoin
name = []
url_add = []
period = []
strong = []
term = []
list_soup = soup.find_all('li', 'item type1')
for item in list_soup:
    name.append(item.find('a').get_text())
    url_add.append(urljoin(url_base, item.find('a')['href'])) #question
    # 'NoneType' object has no attribute 'get_text'
    #if item is not None:
    #    period.append(item.find(class_='period').get_text())
    #    strong.append(item.find('strong'))
    #    term.append(item.find(class_='term'))
    #else:
    #    print("null")
url_add[:5]
name[:5]
period[:23]
strong[:23]
import pandas as pd
data = {'Name':name, 'URL':url_add}
df = pd.DataFrame(data)
df.head()
df.to_csv('./hana_list.csv', sep=',', encoding='EUC-KR')
df['URL'][0]
html = urlopen(df['URL'][0])
soup_tmp = BeautifulSoup(html, "html.parser")
soup_tmp
print(soup_tmp.find('dl', 'prodcutInfo'))
print(soup_tmp.find('div', 'exist_table'))
print(soup_tmp.find(class_='tbl_tbldiv'))
'NoneType' object has no attribute 'get_text'
You are calling 'get_text' twice. You have put in an error check for one of them but not the other.
for item in list_soup:
    name.append(item.find('a').get_text()) #<--- you may also need to check item for not None here
    url_add.append(urljoin(url_base, item.find('a')['href']))
    if item is not None:
        period.append(item.find(class_='period').get_text()) #<--- you are checking item for not None here
        strong.append(item.find('strong'))
        term.append(item.find(class_='term'))
    else:
        print("null")
Or, perhaps you are checking the wrong thing. You might need to check item.find(class_='period') is not None rather than just if item is not None.
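For instance, here is a rough sketch of that per-tag check (my own illustration, reusing the list_soup, name, url_add and period variables from the question) that also records 'null' the way the question asks:
for item in list_soup:
    a_tag = item.find('a')
    if a_tag is not None:  # guard each find() result before using it
        name.append(a_tag.get_text())
        url_add.append(urljoin(url_base, a_tag['href']))
    period_tag = item.find(class_='period')
    if period_tag is not None:
        period.append(period_tag.get_text())
    else:
        # the tag is missing for this item, so store 'null' instead
        period.append('null')
        print("null")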
I'm getting an error in a program that fetches (searches) data from YouTube, and it shows the error AttributeError: 'function' object has no attribute 'read'. I am on Python 3.
import urllib.request
from bs4 import BeautifulSoup
import sys
flag = 0
textToSearch = 'hello world'
query = sys.argv[0].strip("\"").replace(" ","+")
url = "https://www.youtube.com/results?search_query=" + query
response = urllib.request.urlopen
html = response.read()
soup = BeautifulSoup(html,"lxml")
for vid in soup.findAll(attrs={'class':'yt-uix-tile-link'}):
    if ('https://www.youtube.com' + vid['href']).startswith("https://www.youtube.com/watch?v="):
        flag = 1
        print ('https://www.youtube.com' + vid['href'])
if flag == 0:
    print ("No results found")
The mistake has been made here:
response = urllib.request.urlopen
html = response.read()
You put urllib.request.urlopen into the response variable instead of the result of calling that function.
So instead of
response = urllib.request.urlopen
you should call it with appropriate parameters:
response = urllib.request.urlopen( .. parameters come here ... )
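For example (a sketch of my own, reusing the url variable already built in the question):
# call urlopen with the url string, then read the response body
response = urllib.request.urlopen(url)
html = response.read()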
Have you tried using the requests library?
like this:
import requests
from bs4 import BeautifulSoup
import sys
flag = 0
textToSearch = 'hello world'
query = sys.argv[0].strip("\"").replace(" ","+")
url = "https://www.youtube.com/results?search_query=" + query
response = requests.get(url)
html = response.content
soup = BeautifulSoup(html,"lxml")
for vid in soup.findAll(attrs={'class':'yt-uix-tile-link'}):
    if ('https://www.youtube.com' + vid['href']).startswith("https://www.youtube.com/watch?v="):
        flag = 1
        print ('https://www.youtube.com' + vid['href'])
if flag == 0:
    print ("No results found")
Hey, this is my setup: Spyder 3.3.1 / beautifulsoup4 / Python 3.6.
The code below is from an article on Medium (here) about web scraping with Python and BeautifulSoup. It was supposed to be a quick read, but now, TWO days later, I still can't get the code to run in Spyder and keep getting:
File "/Users/xxxxxxx/Documents/testdir/swiftScrape.py", line 9, in table_to_df
return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
AttributeError: 'NoneType' object has no attribute 'tbody'
Not sure what is going wrong; it seems to be an implementation error. Can anyone assist in shedding some light on this issue?
Thanks in advance.
import os
import bs4
import requests
import pandas as pd
PATH = os.path.join("C:\\","Users","xxxxx","Documents","tesdir")
def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
def next_page(soup):
    return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')
res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id':'tableID'})
    res = res.append(table_to_df(table))
    res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=None, sep=';', encoding='iso-8859-1')
    url = next_page(soup)
    counter += 1
Like a lot of example code found on the web, this code is not production-grade code - it blindly assumes that HTTP requests always succeed and return the expected content. The truth is that it's quite often not the case (network errors, proxies or firewalls that block you, the site being down - temporarily or permanently, updates to the site that changed the URLs and/or the page's markup, etc.).
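As a minimal illustration of that point (my own sketch, not part of the answer), you could at least check the HTTP status at the top of the while loop before parsing:
page = requests.get(url)
if page.status_code != 200:
    # stop instead of parsing an error page
    print("request failed for {}: {}".format(url, page.status_code))
    break
soup = bs4.BeautifulSoup(page.content, 'lxml')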
Your problem manifests itself here:
def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.findAll('td')] for row in table.tbody.findAll('tr')])
and comes from table actually being None, which means that here in the loop:
table = soup.find(name='table', attrs={'id':'tableID'})
there was no "table" tag with id "tableID" found in the html document. You can check this by printing the actual html content:
while True:
    print(counter)
    page = requests.get(url)
    soup = bs4.BeautifulSoup(page.content, 'lxml')
    table = soup.find(name='table', attrs={'id':'tableID'})
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format( page.content))
        continue
    # etc
Thanks @bruno desthuilliers for your pointers. Much appreciated.
This is the rewritten code that worked for me, using Selenium and webdriver rather than requests:
import os
import bs4
import pandas as pd
from selenium import webdriver
PATH = os.path.join('/','Users','benmorris','documents','testdir')
def table_to_df(table):
    return pd.DataFrame([[td.text for td in row.find_all('td')] for row in soup.find_all('tr')])
def next_page(soup):
    return "http:" + soup.find('a', attrs={'rel':'next'}).get('href')
res = pd.DataFrame()
url = "http://bank-code.net/country/FRANCE-%28FR%29/"
counter = 0
driver = webdriver.Chrome()
driver.get(url)
while True:
    print(counter)
    page = driver.get(url)
    soup = bs4.BeautifulSoup(driver.page_source, 'lxml')
    table = driver.find_element_by_xpath('//*[@id="tableID"]')
    if table is None:
        print("no table 'tableID' found for url {}".format(url))
        print("html content:\n{}\n".format( page.content))
        continue
    res = res.append(table_to_df(table))
    res.to_csv(os.path.join(PATH,"BIC","table.csv"), index=False, sep=',', encoding='iso-8859-1')
    url = next_page(soup)
    counter += 1