BeautifulSoup find() takes no keyword arguments error - python

from bs4 import BeautifulSoup
from selenium import webdriver
import time
import sys
query_txt = input("Enter the content to crawl: ")
path = r"C:\Temp\chromedriver_240\chromedriver.exe"  # raw string so the backslashes are not treated as escape sequences
driver = webdriver.Chrome(path)
driver.get("https://www.naver.com")
time.sleep(2)
driver.find_element_by_id("query").send_keys(query_txt)
driver.find_element_by_id("search_btn").click()
driver.find_element_by_link_text("블로그 더보기").click()  # "블로그 더보기" = "View more blogs"
full_html = driver.page_source
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.find('ul', id='elThumbnailResultArea')
print(content_list)
content = content_list.find('a','sh_blog_title _sp_each_url _sp_each_title' ).get_text()
print(content)
for i in content_list:
    con = i.find('a', class_='sh_blog_title _sp_each_url _sp_each_title').get_text()
    print(con)
    print('\n')
I typed this code while following an online course, but the loop always raises an error.
con = i.find('a', class_='sh_blog_title _sp_each_url _sp_each_title').get_text()
This line shows the error 'find() takes no keyword arguments'.

This error happens because for i in content_list: iterates over the children of the <ul> Tag, and some of those children are NavigableString objects (the whitespace between tags). On a string, .find() resolves to Python's built-in str.find(), which takes no keyword arguments, hence the TypeError.
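You can see this in a minimal, self-contained sketch (the markup here is hypothetical, just to expose the child types):

from bs4 import BeautifulSoup

ul = BeautifulSoup('<ul>\n<li><a class="x">first</a></li>\n</ul>', 'html.parser').ul
for child in ul:
    print(repr(child))   # '\n', <li>...</li>, '\n' -- plain strings sit between the tags
# On the '\n' string, .find() is str.find(), so passing class_= raises:
# TypeError: find() takes no keyword arguments

So the fix is to use .find_all() on the soup to collect all the <a> tags at once; .find() only returns one tag (if there's any):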
import requests
from bs4 import BeautifulSoup

url = 'https://search.naver.com/search.naver?query=tree&where=post&sm=tab_nmr&nso='
full_html = requests.get(url).content
soup = BeautifulSoup(full_html, 'html.parser')
content_list = soup.find_all('a', class_='sh_blog_title _sp_each_url _sp_each_title')
for i in content_list:
    print(i.text)
    print('\n')
Prints:
[2017/공학설계 입문] Romantic Tree
장충동/Banyan Tree Club & Spa/Club Members Restaurant
2020-06-27 Joshua Tree National Park Camping(조슈아트리...
[결혼준비/D-102] 웨딩밴드 '누니주얼리 - like a tree'
Book Club - Magic Tree House # 1 : Dinosaur Before Dark...
비밀 정원, 조슈아 트리 국립공원(Joshua Tree National Park)
그뤼너씨 TEA TREE 티트리 라인 3종리뷰
Number of Nodes in the Sub-Tree With the Same Label
태국의 100년 넘은 Giant tree
[부산 기장 카페] 오션뷰 뷰맛집카페 : 씨앤트리 sea&tree

use .find('a', attrs={"class": "<class name>"}) instead. Reference: the BeautifulSoup docs
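A minimal sketch of the attrs form, using hypothetical markup standing in for the Naver results page (note that attrs={} also only works when called on a Tag, not on a string child):

from bs4 import BeautifulSoup

html = '<ul id="elThumbnailResultArea"><li><a class="sh_blog_title _sp_each_url _sp_each_title">a blog title</a></li></ul>'
soup = BeautifulSoup(html, 'html.parser')
content_list = soup.find('ul', id='elThumbnailResultArea')
# attrs={} passes the class as a plain attribute filter instead of the class_ keyword
link = content_list.find('a', attrs={"class": "sh_blog_title _sp_each_url _sp_each_title"})
if link is not None:
    print(link.get_text())   # -> a blog title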

These two links will definitely help you.
Understand the Find() function in Beautiful Soup
Find on beautiful soup in loop returns TypeError

Related

How to extract text from 'a' element with BeautifulSoup?

I'm trying to get the text from an <a> HTML element I got with BeautifulSoup.
I am able to print the whole thing, and what I want to find is right there:
-1
Tensei Shitara Slime Datta Ken Manga
-1
But when I want to be more specific and get the text from it, it gives me this error:
File "C:\python\manga\manga.py", line 15, in <module>
print(title.text)
AttributeError: 'int' object has no attribute 'text'
Here is the code I'm running:
import requests
from bs4 import BeautifulSoup
URL = 'https://mangapark.net/manga/tensei-shitara-slime-datta-ken-fuse'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find('section', class_='manga')
manga_title = soup.find('div', class_='pb-1 mb-2 line-b-f hd')
for m_title in manga_title:
    title = m_title.find('a')
    print(title.text)
I've searched for my problem but couldn't find anything that helped.
What actually happens: iterating over a tag yields all of its children, and some of them are plain strings. Calling .find('a') on a string uses Python's str.find(), which returns -1 when the substring isn't found.
Returning -1 isn't a very common way in Python to signal that no value exists, but it is a common one in other languages.
import requests
from bs4 import BeautifulSoup
URL = 'https://mangapark.net/manga/tensei-shitara-slime-datta-ken-fuse'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find('section', class_='manga')
manga_title = soup.find('div', class_='pb-1 mb-2 line-b-f hd')
for m_title in manga_title.children:
    title = m_title.find('a')
    # When m_title is a plain string child, .find('a') is str.find(),
    # which returns -1 when the substring isn't found
    if title != -1:
        print(title.text)
Output
Tensei Shitara Slime Datta Ken Manga
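For reference, a sketch that avoids the -1 sentinel entirely, assuming the same page structure: ask the div for its <a> tags directly, since find_all() only returns Tag objects and skips the string children.

import requests
from bs4 import BeautifulSoup

URL = 'https://mangapark.net/manga/tensei-shitara-slime-datta-ken-fuse'
soup = BeautifulSoup(requests.get(URL).content, 'html.parser')
manga_title = soup.find('div', class_='pb-1 mb-2 line-b-f hd')
# every result of find_all() is a Tag, so .text is always safe here
for a in manga_title.find_all('a'):
    print(a.text)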

Web Scraping find not moving on to next item

from bs4 import BeautifulSoup
import requests
def kijiji():
    source = requests.get('https://www.kijiji.ca/b-mens-shoes/markham-york-region/c15117001l1700274').text
    soup = BeautifulSoup(source, 'lxml')
    b = soup.find('div', class_='price')
    for link in soup.find_all('a', class_='title'):
        a = link.get('href')
        fulllink = 'http://kijiji.ca' + a
        print(fulllink)
        b = soup.find('div', class_='price')
        print(b.prettify())

kijiji()
The goal is to list all the different kinds of items sold on Kijiji and pair each one up with its price.
But I can't seem to find any way to advance what Beautiful Soup finds for the price class, so I'm stuck with the first price. find_all doesn't work either, as it just prints the whole blob instead of grouping each price with its item.
If you have Beautiful Soup 4.7.1 or above, you can use the select() method with a CSS selector, which is much faster.
code:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.kijiji.ca/b-mens-shoes/markham-york-region/c15117001l1700274").text
soup = BeautifulSoup(res, 'html.parser')
for item in soup.select('.info-container'):
    fulllink = 'http://kijiji.ca' + item.find_next('a', class_='title')['href']
    print(fulllink)
    price = item.select_one('.price').text.strip()
    print(price)
Or, to use find_all(), use the code block below:
import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.kijiji.ca/b-mens-shoes/markham-york-region/c15117001l1700274").text
soup = BeautifulSoup(res, 'html.parser')
for item in soup.find_all('div', class_='info-container'):
    fulllink = 'http://kijiji.ca' + item.find_next('a', class_='title')['href']
    print(fulllink)
    price = item.find_next(class_='price').text.strip()
    print(price)
Congratulations on finding the answer. I'll give you another solution for reference only.
import requests
from simplified_scrapy.simplified_doc import SimplifiedDoc
def kijiji():
    url = 'https://www.kijiji.ca/b-mens-shoes/markham-york-region/c15117001l1700274'
    source = requests.get(url).text
    doc = SimplifiedDoc(source)
    infos = doc.getElements('div', attr='class', value='info-container')
    for info in infos:
        price = info.select('div.price>text()')
        a = info.select('a.title')
        link = doc.absoluteUrl(url, a.href)
        title = a.text
        print(price)
        print(link)
        print(title)

kijiji()
Result:
$310.00
https://www.kijiji.ca/v-mens-shoes/markham-york-region/jordan-4-oreo-2015/1485391828
Jordan 4 Oreo (2015)
$560.00
https://www.kijiji.ca/v-mens-shoes/markham-york-region/yeezy-boost-350-yecheil-reflectives/1486296645
Yeezy Boost 350 Yecheil Reflectives
...
Here are more examples: https://github.com/yiyedata/simplified-scrapy-demo/tree/master/doc_examples
from bs4 import BeautifulSoup
import requests

def kijiji():
    source = requests.get('https://www.kijiji.ca/b-mens-shoes/markham-york-region/c15117001l1700274').text
    soup = BeautifulSoup(source, 'lxml')
    b = soup.find('div', class_='price')
    for link in soup.find_all('a', class_='title'):
        a = link.get('href')
        fulllink = 'http://kijiji.ca' + a
        print(fulllink)
        print(b.prettify())
        b = b.find_next('div', class_='price')

kijiji()
I was stuck on this for an hour; as soon as I posted it on Stack Overflow I immediately came up with an idea. Messy code, but it works!
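For reference, the reason this works: find_next() searches forward from the current element's position in the document, so reassigning b = b.find_next('div', class_='price') advances one price per iteration. A minimal sketch with hypothetical markup:

from bs4 import BeautifulSoup

html = '<div class="price">$1</div><div class="price">$2</div>'
soup = BeautifulSoup(html, 'html.parser')
b = soup.find('div', class_='price')
print(b.text)                          # $1
b = b.find_next('div', class_='price')
print(b.text)                          # $2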

Scraping the news (Python 3.6, BeautifulSoup)

I want to scrape spiegel.de/schlagzeilen to get all the news items shown below the dates (today, yesterday, two days ago).
<div class="schlagzeilen-content schlagzeilen-overview">
contains what I want, I think, but there is one problem left:
print(data)
prints the data I need, but it also comes with a bunch of phrases I don't want (names of the embedded modules, bits of HTML/CSS, and so on). So I chose:
for item in data:
    print(item.text)
This one has very pretty output(!), but now I'm missing the article URL, which is important to have. Is there anybody who can help me out? Here is my code:
from bs4 import BeautifulSoup
import requests
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup((r.content), "lxml")
data = soup.find_all("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for item in data:
    print(item.text)
You could use a CSS selector to find all article links:
from bs4 import BeautifulSoup
import requests
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup((r.content), "lxml")
# data = soup.find_all("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
links = soup.select('div.schlagzeilen-content a')
for item in links:
    print(item.text, website + item['href'])
Some output:
Bayern: Sechs Tote in Gartenlaube - keine Hinweise auf Gewaltverbrechen http://spiegel.de/schlagzeilen/panorama/justiz/tote-in-gartenlaube-keine-hinweise-auf-gewaltverbrechen-a-1132268.html
Starbucks, Tesla, GE: Trumps Einreiseverbot beunruhigt US-Konzerne http://spiegel.de/schlagzeilen/wirtschaft/soziales/donald-trump-und-das-einreiseverbot-us-konzerne-zeigen-sich-besorgt-a-1132262.html
...
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
website = 'http://spiegel.de/schlagzeilen'
r = requests.get(website)
soup = BeautifulSoup((r.content), "lxml")
div = soup.find("div", {"class": "schlagzeilen-content schlagzeilen-overview"})
for a in div.find_all('a', title=True):
    link = urljoin(website, a.get('href'))
    print(a.text, a.find_next_sibling('span').text)
    print(link)
Output:
Südafrika: Dutzende Patienten sterben nach Verlegung (Panorama, 13:09)
http://spiegel.de/panorama/gesellschaft/suedafrika-verlegung-in-privatkliniken-dutzende-patienten-gestorben-a-1132677.html
Trumps Stotterstart: Ein Präsident, so unbeliebt wie keiner zuvor (Politik, 12:59)
http://spiegel.de/politik/ausland/donald-trump-als-us-praesident-so-unbeliebt-wie-kein-vorgaenger-a-1132554.html
Kontrolle von Gefährdern: Kabinett beschließt elektronische Fußfessel (Politik, 12:33)
The tag you need is the <a> tag, and its sibling <span> contains (Netzwelt, 12:23), so just use find_all() and use the <a> tag as an anchor.
And if you want the full path of the URL, use urljoin.
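As a quick illustration of urljoin (the paths here are made up for demonstration):

from urllib.parse import urljoin

# urljoin resolves a relative or root-relative href against the page it came from
print(urljoin('http://spiegel.de/schlagzeilen', '/panorama/foo-a-123.html'))
# -> http://spiegel.de/panorama/foo-a-123.html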

Scraping Indeed with Beautiful Soup

I'm unfamiliar with HTML and web scraping with Beautiful Soup. I'm trying to retrieve job titles, salaries, locations, and company names from various Indeed job postings. This is my code so far:
URL = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"
import urllib2
import bs4
from bs4 import BeautifulSoup
soup = BeautifulSoup(urllib2.urlopen(URL).read())
resultcol = soup.find_all(id = 'resultsCol')
company = soup.findAll('span', attrs={"class":"company"})
jobs = (soup.find_all({'class': " row result"}))
Though I have the commands to find the jobs and companies, I can't get their contents. I'm aware there's a .contents attribute, but none of my variables so far seem to have it. Thanks!
First I search for the div that holds all the elements for one job, and then I search for the elements inside this div:
import urllib2
from bs4 import BeautifulSoup
URL = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"
soup = BeautifulSoup(urllib2.urlopen(URL).read(), 'html.parser')
results = soup.find_all('div', attrs={'data-tn-component': 'organicJob'})
for x in results:
    company = x.find('span', attrs={"itemprop": "name"})
    print 'company:', company.text.strip()
    job = x.find('a', attrs={'data-tn-element': "jobTitle"})
    print 'job:', job.text.strip()
    salary = x.find('nobr')
    if salary:
        print 'salary:', salary.text.strip()
    print '----------'
Updated @furas' example for Python 3:
import urllib.request
from bs4 import BeautifulSoup
URL = "https://www.indeed.com/jobs?q=data+scientist+%2420%2C000&l=New+York&start=10"
soup = BeautifulSoup(urllib.request.urlopen(URL).read(), 'html.parser')
results = soup.find_all('div', attrs={'data-tn-component': 'organicJob'})
for x in results:
    company = x.find('span', attrs={"class": "company"})
    if company:
        print('company:', company.text.strip())
    job = x.find('a', attrs={'data-tn-element': "jobTitle"})
    if job:
        print('job:', job.text.strip())
    salary = x.find('nobr')
    if salary:
        print('salary:', salary.text.strip())
    print('----------')

Scraping data from href

I was trying to get the postcodes for DFS. For that I tried getting the href for each shop and then clicking on it; the next page has the shop location, from which I can get the postal code. But I am not able to get things working. Where am I going wrong?
I tried getting the upper-level element first (td.searchResults), and then for each of them I try to click on the href with title DFS and, after clicking, get the postalCode. Eventually I want to iterate over all three pages.
If there is a better way to do it, let me know.
driver = webdriver.Firefox()
driver.get('http://www.localstore.co.uk/stores/75061/dfs/')
html = driver.page_source
soup = BeautifulSoup(html)
listings = soup.select('td.searchResults')
for l in listings:
    while True:
        driver.find_element_by_css_selector("a[title*='DFS']").click()
        shops = {}
        #info = soup.find('span', itemprop='postalCode').contents
        html = driver.page_source
        soup = BeautifulSoup(html)
        info = soup.find(itemprop="postalCode").get_text()
        shops.append(info)
Update:
driver = webdriver.Firefox()
driver.get('http://www.localstore.co.uk/stores/75061/dfs/')
html = driver.page_source
soup = BeautifulSoup(html)
listings = soup.select('td.searchResults')
for l in listings:
    driver.find_element_by_css_selector("a[title*='DFS']").click()
    shops = []
    html = driver.page_source
    soup = BeautifulSoup(html)
    info = soup.find_all('span', attrs={"itemprop": "postalCode"})
    for m in info:
        if m:
            m_text = m.get_text()
            shops.append(m_text)
    print(shops)
So after playing with this for a little while, I don't think the best way to do this is with selenium. It would require using driver.back(), waiting for elements to reappear, and a whole mess of other stuff. I was able to get what you want using just requests, re and bs4. re is included in the Python standard library, and if you haven't installed requests, you can do it with pip as follows: pip install requests
from bs4 import BeautifulSoup
import re
import requests

base_url = 'http://www.localstore.co.uk'
url = 'http://www.localstore.co.uk/stores/75061/dfs/'
res = requests.get(url)
soup = BeautifulSoup(res.text)
shops = []
links = soup.find_all('a', href=re.compile('.*\/store\/.*'))
for l in links:
    full_link = base_url + l['href']
    town = l['title'].split(',')[1].strip()
    res = requests.get(full_link)
    soup = BeautifulSoup(res.text)
    info = soup.find('span', attrs={"itemprop": "postalCode"})
    postalcode = info.text
    shops.append(dict(town_name=town, postal_code=postalcode))
print(shops)
Your code has some problems. You are using an infinite loop without a break condition. Also, shops = {} is a dict, but you are calling the append method on it (append belongs to lists).
Instead of using selenium you can use python-requests or urllib2.
But in your code you can do something like this:
driver = webdriver.Firefox()
driver.get('http://www.localstore.co.uk/stores/75061/dfs/')
html = driver.page_source
soup = BeautifulSoup(html)
listings = soup.select('td.searchResults')
for l in listings:
    driver.find_element_by_css_selector("a[title*='DFS']").click()
    shops = []
    html = driver.page_source
    soup = BeautifulSoup(html)
    info = soup.find('span', attrs={"itemprop": "postalCode"})
    if info:
        info_text = info.get_text()
        shops.append(info_text)
print(shops)
In BeautifulSoup you can find a tag by its attributes like this:
soup.find('span', attrs={"itemprop": "postalCode"})
Also, if it doesn't find anything, it will return None, and calling the .get_text() method on None will raise an AttributeError. So check first before applying .get_text().
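A minimal, self-contained sketch of that guard (the markup is hypothetical):

from bs4 import BeautifulSoup

soup = BeautifulSoup('<span itemprop="postalCode">AB1 2CD</span>', 'html.parser')
info = soup.find('span', attrs={"itemprop": "postalCode"})
if info is not None:   # find() returns None when nothing matches
    print(info.get_text())   # -> AB1 2CD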
