I'm trying to select an element that looks like <li class="result first_result"> using the query soup.find_all("li", {"class" : "first_result"})
The element is definitely on the page but it's not showing up when I run my script. I've also tried soup.find_all("li", {"class" : "result first_result"}) for the record, but still nothing.
What am I doing wrong?
Edit: at alecxe's request, I've posted the code I have so far. I'm on 64-bit Windows 7 using Python 3.4, which I'm sure is the culprit. The specific part this question is about is at the very bottom, under ###METACRITIC STUFF###.
from bs4 import BeautifulSoup
from urllib3 import poolmanager
import csv
import requests
import sys
import os
import codecs
import re
import html5lib
import math
import time
from random import randint
connectBuilder = poolmanager.PoolManager()
inputstring = sys.argv[1] #argv string MUST use double quotes
inputarray = re.split('\s+',inputstring)
##########################KAT STUFF########################
katstring = ""
for item in inputarray: katstring += (item + "+")
katstring=katstring[:-1]
#kataddress = "https://kat.cr/usearch/?q=" + katstring #ALL kat
kataddress = "https://kat.cr/usearch/" + inputstring + " category:tv/?field=seeders&sorder=desc" #JUST TV kat
#print(kataddress)
numSeedsArray = []
numLeechArray = []
r = requests.get(kataddress)
soup = BeautifulSoup(r.content, "html5lib")
totalpages = [h2.find('span') for h2 in soup.findAll('h2')][0].text #get a string that looks like 'house of cards results 1-25 from 178'
totalpages = int(totalpages[-4:]) #slice off everything but the total # of pages
totalpages = math.floor(totalpages/25)
#print("totalpages= "+str(totalpages))
iteration=0
savedpage = ""
def getdata(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    global numSeedsArray
    global numLeechArray
    tds = soup.findAll("td", { "class" : "green center" })
    numSeedsArray += [int(td.text) for td in tds]
    tds = soup.findAll("td", { "class" : "red lasttd center"})
    numLeechArray += [int(td.text) for td in tds]
    #print(numSeedsArray)
def getnextpage(url):
    global iteration
    global savedpage
    #print("url examined= "+url)
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    nextpagelinks = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton" })
    nextpagelinks = [link.get('href') for link in nextpagelinks]
    #print(nextpagelinks)
    activepage = soup.findAll("a", { "class" : "turnoverButton siteButton bigButton active" })
    #print("activepage= " +activepage[0].text)
    currentpagenum = activepage[0].text
    #print("currentpagenum= "+currentpagenum)
    if len(currentpagenum)==1 and iteration>1:
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==1 and iteration<=1:
        nextpage = str(nextpagelinks[0][:-28])+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        savedpage = str(nextpagelinks[0][:-28])
        #print("savedpage= "+savedpage )
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    elif len(currentpagenum)==2:
        nextpage = savedpage+str(int(currentpagenum)+1)+str(nextpagelinks[0][-27:])
        #print("nextpage= "+nextpage)
        nextpage = re.sub(r'(%20)', ' ', nextpage)
        nextpage = re.sub(r'(%3A)', ':', nextpage)
        nextpage = "https://kat.cr"+nextpage
        #print(nextpage)
    return nextpage
if totalpages<2:
    while iteration < totalpages-1: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
else:
    while iteration < 2: #should be totalpages-1 for max accuracy
        getdata(kataddress)
        iteration+=1
        kataddress = getnextpage(kataddress)
# print(str(sum(numSeedsArray)))
# print(str(sum(numLeechArray)))
print(str(sum(numLeechArray)+sum(numSeedsArray)))
def getgoogdata(title):
    title = re.sub(r' ', '+', title)
    url = 'https://www.google.com/search?q=' +title+ '&ie=utf-8&oe=utf-8'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, "html5lib")
    resultnum = soup.find("div", {"id": "resultStats"}).text[:-14]
    s2 = resultnum.replace(',', '')
    resultnum = re.findall(r'\b\d+\b', s2)
    print(resultnum)
getgoogdata(inputstring)
####################METACRITIC STUFF#########################
metainputstring = ""
for item in inputarray:
    metainputstring += item + " "
metainputstring = metainputstring[:-1]
metacriticaddress = "http://www.metacritic.com/search/tv/" + metainputstring + "/results"
print (metacriticaddress)
r = requests.get(metacriticaddress)
soup = BeautifulSoup(r.content, "html5lib")
first_result = soup.find_all("li", attrs={"class" : "first_result"})
# first_result = soup.select("li.result.first_result")
print(first_result)
None of the other answers address your actual problem.
You need to pretend to be a real browser to be able to see the search results:
r = requests.get(metacriticaddress, headers={
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
})
Proof (searching for Game of Thrones, of course):
>>> from bs4 import BeautifulSoup
>>>
>>> import requests
>>>
>>> metacriticaddress = "http://www.metacritic.com/search/tv/game%20of%20thrones/results"
>>> r = requests.get(metacriticaddress, headers={
... "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"
... })
>>> soup = BeautifulSoup(r.content, "html5lib")
>>> first_result = soup.find_all("li", class_="first_result")
>>>
>>> print(first_result[0].find("h3", class_="product_title").get_text(strip=True))
Game of Thrones
Quoting the documentation:
It’s very useful to search for a tag that has a certain CSS class, but the name of the CSS attribute, “class”, is a reserved word in Python. Using class as a keyword argument will give you a syntax error. As of Beautiful Soup 4.1.2, you can search by CSS class using the keyword argument class_
Thus, you need to instead write: soup.find_all("li", class_="first_result").
If you're using a pre-4.1.2 version of BeautifulSoup, or if you're insistent upon passing a dictionary, you need to specify that the dictionary fills the attrs parameter: soup.find_all("li", attrs={"class" : "first_result"}).
Your first attempt (soup.find_all("li", {"class" : "first_result"})) is almost correct, but you need to specify the parameter that your dictionary is being passed to (in this case the parameter name is attrs), and call it like soup.find_all("li", attrs={"class" : "first_result"}).
However, I would suggest doing this with a CSS selector, because you're matching against multiple classes. You can do this using the soup's .select() method, like this:
results = soup.select("li.result.first_result")
Be aware that .select() will always return a list, so if there's only one element, don't forget to access it as results[0].
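Combining the User-Agent fix above with the CSS selector, a minimal sketch for your ###METACRITIC STUFF### section (reusing the metacriticaddress variable from your script) might look like this:
import requests
from bs4 import BeautifulSoup

# Send a browser-like User-Agent so Metacritic returns the real results page
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36"}
r = requests.get(metacriticaddress, headers=headers)
soup = BeautifulSoup(r.content, "html5lib")

# Match both classes at once; .select() always returns a list
results = soup.select("li.result.first_result")
if results:
    print(results[0].get_text(strip=True))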
Related
I'm just learning Python and want to improve by working through examples. Sorry for my English; I'm in the process of learning a new language. :)
The program pulls data from an e-commerce site.
When I save the data as a CSV file, each new batch of data overwrites the previous data. I tried several examples but it didn't work.
Thanks for your help.
import requests
import gettext
from bs4 import BeautifulSoup
import pandas as pd
import openpyxl as xls
import xlsxwriter

baseurl = "https://www.trendyol.com"
headers = {'User-Agent':
           'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.106 Safari/537.36 OPR/38.0.2220.41'
           }

for x in range(1,62):
    r = requests.get(f'https://www.trendyol.com/cep-telefonu-x-c103498?pi={x}', headers=headers)
    soup = BeautifulSoup(r.content, 'lxml')
    productlist = soup.find_all('div', class_='p-card-wrppr')
    for item in productlist:
        productname = item.find('span', class_='prdct-desc-cntnr-name').getText()
        productprice_old = item.find('div', class_='prc-box-sllng').getText()
        productprice_discount = item.find('div', class_='prc-box-dscntd')
        for productlink in item.find_all('a'):
            productlink = baseurl+productlink.get('href')
        if productprice_discount == None:
            productprice_discount = productprice_old
        else:
            productprice_discount = productprice_discount.getText()
        for merchant_name in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            merchant_name = soup.find('a', class_='merchant-text')
            if merchant_name == None:
                merchant_name = soup.find('a', class_='title')
            if merchant_name == None:
                merchant_name = soup.find('span', class_='product-description-market-place')
            if merchant_name == None:
                merchant_name = ('NULL')
            else:
                merchant_name = merchant_name.getText()
            break
        for product_image in productlink:
            r = requests.get(productlink, headers=headers)
            soup = BeautifulSoup(r.content, 'lxml')
            product_image = soup.find_all('img', attrs={'class':'detail-section-img'})
            image_src = [x['src'] for x in product_image]
            image_src = [x for x in image_src if x.endswith('.jpg' or '.png')]
            break
        data = [ [productname,productlink,productprice_old,productprice_discount,merchant_name,image_src] ]
        df = pd.DataFrame(data, columns = ["Product Name", "URL", "Price", "D-Price", "Store", "Image Url"])
        df.to_csv('trendyol3.csv')
You should add mode='a' (append), so that each write appends to the file instead of overwriting it:
df.to_csv('trendyol3.csv', mode='a')
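One thing to watch for with mode='a' is that to_csv will also write the column header on every call. A minimal sketch (keeping your df and trendyol3.csv filename from the question) that only writes the header when the file doesn't exist yet might look like this:
import os
import pandas as pd

# Append each batch; write the header row only on the first write
write_header = not os.path.isfile('trendyol3.csv')
df.to_csv('trendyol3.csv', mode='a', header=write_header, index=False)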
I need to get the text "Платонов А.П." Here's my code so far.
import requests
from bs4 import BeautifulSoup
from pip._internal.network.utils import HEADERS
URL = "https://www.moscowbooks.ru/books/?sortby=name&sortdown=false"
HEADERS = {"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.3 Safari/605.1.15", "accept" : "*/*"}
HOST = "https://www.moscowbooks.ru"
def get_html(url, params=None):
    r = requests.get(url, headers = HEADERS, params = params)
    return r

def get_content(html):
    soup = BeautifulSoup(html, "html.parser")
    items = soup.find_all("div", class_ = "catalog__item")
    books = []
    for item in items:
        author_check = item.find("a", class_="author-name")
        if author_check:
            author = author_check.get_text()
        else:
            author_check = "Автор не указан"
        books.append({
            "title": item.find("div", class_ = "book-preview__title").get_text(strip=True),
            "author": author_check,
            "link": HOST + item.find("a", class_ = "book-preview__title-link").get("href"),
            "cost": item.find("div", class_="book-preview__price").get_text(strip=True),
        })
    print(books)
    print(len(books))

def parse():
    html = get_html(URL)
    if html.status_code == 200:
        get_content(html.text)
    else:
        print("Error")
parse()
I'm having a problem with the author, because it comes out like this:
<a class="author-name" href="/catalog/author/21381/">Платонов А. П. </a>
I also need a little help with the price, because sometimes it comes out as '2\xa0274' instead of '2 274'.
The problem is that you put "author": author_check in your dictionary, but author_check is the tag returned by item.find("a", class_="author-name"), while the text is in author = author_check.get_text(). You can change your for loop to something like this:
for item in items:
    author_check = item.find("a", class_="author-name")
    if author_check:
        author = author_check.text
    else:
        author = "Автор не указан"
For your issue with the display of the prices, you can just replace \xa0 (a non-breaking space) with a comma or a regular space.
"cost": item.find("div", class_="book-preview__price").get_text(strip=True).replace(u"\xa0", ",")
I've had to deal with a similar problem. You can do the following:
author = author_check.get_text().split('>')[-2].split('<')[0]
You might have to substitute -2 with -1.
I am using bs4 to write a webscraper to obtain funding news data.
The first part of my code extracts the title, link, summary and date of each article for n number of pages.
The second part of my code loops through the link column and passes the resulting url into a new function, which extracts the url of the company in question.
For the most part, the code works fine (40 pages scraped without errors). I am trying to stress test it by raising it to 80 pages, but I'm running into KeyError: 'href' and I don't know how to fix this.
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from tqdm import tqdm
def clean_data(column):
    df[column]= df[column].str.encode('ascii', 'ignore').str.decode('ascii')

#extract
def extract(page):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    url = f'https://www.uktechnews.info/category/investment-round/series-a/page/{page}/'
    r = requests.get(url, headers)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

#transform
def transform(soup):
    for item in soup.find_all('div', class_ = 'post-block-style'):
        title = item.find('h3', {'class': 'post-title'}).text.replace('\n','')
        link = item.find('a')['href']
        summary = item.find('p').text
        date = item.find('span', {'class': 'post-meta-date'}).text.replace('\n','')
        news = {
            'title': title,
            'link': link,
            'summary': summary,
            'date': date
        }
        newslist.append(news)
    return

newslist = []

#subpage
def extract_subpage(url):
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Safari/605.1.15'}
    r = requests.get(url, headers)
    soup_subpage = BeautifulSoup(r.text, 'html.parser')
    return soup_subpage

def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    if len(main_data):
        subpage_link = {
            'subpage_link': main_data[0]['href']
        }
        subpage.append(subpage_link)
    else:
        subpage_link = {
            'subpage_link': '--'
        }
        subpage.append(subpage_link)
    return

subpage = []

#load
page = np.arange(0, 80, 1).tolist()
for page in tqdm(page):
    try:
        c = extract(page)
        transform(c)
    except:
        None

df1 = pd.DataFrame(newslist)

for url in tqdm(df1['link']):
    t = extract_subpage(url)
    transform_subpage(t)

df2 = pd.DataFrame(subpage)
I think the issue is that my if statement for the transform_subpage function does not account for instances where main_data is not an empty list but does not contain href links. I am relatively new to Python so any help would be much appreciated!
You are correct: it's caused by main_data[0] not having an 'href' attribute at some point. You can try changing the logic to something like this:
def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    if len(main_data):
        if 'href' in main_data[0].attrs:
            subpage_link = {
                'subpage_link': main_data[0]['href']
            }
            subpage.append(subpage_link)
        else:
            subpage_link = {
                'subpage_link': '--'
            }
            subpage.append(subpage_link)
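As an aside (not part of the fix above, just a sketch under the same assumptions about main_data and subpage), the tag's .get() method can make this more compact, since it returns a fallback value when the attribute is missing:
def transform_subpage(soup_subpage):
    main_data = soup_subpage.select("div.entry-content.clearfix > p > a")
    # Tag.get() returns the second argument when the 'href' attribute is absent
    link = main_data[0].get('href', '--') if main_data else '--'
    subpage.append({'subpage_link': link})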
Also, just a note: it's probably not a great idea to iterate over a list while using the same variable name for both the list and each item in it. So change it to something like:
page_list = np.arange(0, 80, 1).tolist()
for page in tqdm(page_list):
I want to scrape a few pages from the Amazon website (title, url, ASIN), and I'm running into a problem where the script only parses 15 products while the page shows 50. I decided to print all of the HTML to the console, and I saw that the HTML ends at 15 products without any errors from the script.
Here is the relevant part of my script (imports added for completeness):
import requests
from time import sleep
from bs4 import BeautifulSoup

keyword = "men jeans".replace(' ', '+')
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
url = "https://www.amazon.com/s/field-keywords={}".format(keyword)
request = requests.session()
req = request.get(url, headers = headers)
sleep(3)
soup = BeautifulSoup(req.content, 'html.parser')
print(soup)
It's because some of the items are generated dynamically. There might be a better solution than using selenium; however, as a workaround, you can try the approach below instead.
from selenium import webdriver
from bs4 import BeautifulSoup

def fetch_item(driver,keyword):
    driver.get(url.format(keyword.replace(" ", "+")))
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    for items in soup.select("[id^='result_']"):
        try:
            name = items.select_one("h2").text
        except AttributeError: name = ""
        print(name)

if __name__ == '__main__':
    url = "https://www.amazon.com/s/field-keywords={}"
    driver = webdriver.Chrome()
    try:
        fetch_item(driver,"men jeans")
    finally:
        driver.quit()
Upon running the above script you should get around 56 names as a result.
import requests
from bs4 import BeautifulSoup

for page in range(1, 21):
    keyword = "red car".replace(' ', '+')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.5; en-US; rv:1.9.1b3) Gecko/20090305 Firefox/3.1b3 GTB5'}
    url = "https://www.amazon.com/s/field-keywords=" + keyword + "?page=" + str(page)
    request = requests.session()
    req = request.get(url, headers=headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    results = soup.findAll("li", {"class": "s-result-item"})
    for i in results:
        try:
            print(i.find("h2", {"class": "s-access-title"}).text.replace('[SPONSORED]', ''))
            print(i.find("span", {"class": "sx-price-large"}).text.replace("\n", ' '))
            print('*' * 20)
        except:
            pass
Amazon's page range maxes out at 20; the code above crawls through those pages.
So I have this code that gives me the URLs I need in a list format:
import requests
from bs4 import BeautifulSoup

offset = 0
links = []
with requests.Session() as session:
    while True:
        r = session.get("http://rayleighev.deviantart.com/gallery/44021661/Reddit?offset=%d" % offset)
        soup = BeautifulSoup(r.content, "html.parser")
        new_links = soup.find_all("a", {'class' : "thumb"})
        # no more links - break the loop
        if not new_links:
            break
        # denotes the number of gallery pages gone through at one time (# of pages times 24 equals the number below)
        links.extend(new_links)
        print(len(links))
        offset += 24
        # denotes the number of gallery pages (# of pages times 24 equals the number below)
        if offset == 48:
            break

for link in links:
    print(link.get("href"))
After that, I try to get some text from each of the URLs; that text sits in roughly the same place on each page. But whenever I run the second half, below, I keep getting a chunk of HTML text and some errors, and I'm not sure how to fix it, or whether there is another, preferably simpler, way to get the text from each URL.
import urllib.request
import re

for link in links:
    url = print("%s" % link)
    headers = {}
    headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'
    req = urllib.request.Request(url, headers = headers)
    resp = urllib.request.urlopen(req)
    respData = resp.read()
    paragraphs = re.findall(r'</a><br /><br />(.*?)</div>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span>', str(respData))
    if paragraphs != None:
        paragraphs = re.findall(r'<br /><br />(.*?)</span></div>', str(respData))
    for eachP in paragraphs:
        print(eachP)
    title = re.findall(r'<title>(.*?)</title>', str(respData))
    for eachT in title:
        print(eachT)
Your code:
for link in links:
    url = print("%s" % link)
assigns None to url. Perhaps you mean:
for link in links:
    url = "%s" % link.get("href")
There's also no reason to use urllib to get the site's content; you can use requests as you did before, by changing:
req = urllib.request.Request(url, headers = headers)
resp = urllib.request.urlopen(req)
respData = resp.read()
to
req = requests.get(url, headers=headers)
soup = BeautifulSoup(req.content, "html.parser")
Now you can get the title and paragraph with just:
title = soup.find('div', {'class': 'dev-title-container'}).h1.text
paragraph = soup.find('div', {'class': 'text block'}).text
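Putting those changes together, a minimal sketch of the second half (assuming the links list from your first snippet, and that these selectors match the pages you're scraping) might look like this:
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

for link in links:
    url = link.get("href")
    req = requests.get(url, headers=headers)
    soup = BeautifulSoup(req.content, "html.parser")
    # Pull the title and description text from each page
    title = soup.find('div', {'class': 'dev-title-container'}).h1.text
    paragraph = soup.find('div', {'class': 'text block'}).text
    print(title)
    print(paragraph)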