How to collect all specified hrefs? - Python

In this test model I can collect the href value of the first ('tr', class_='rowLive') row. I've tried to write a loop to collect all the other hrefs, but it always raises IndentationError: expected an indented block, or complains that I'm using find instead of find_all.
How should I proceed to collect all the hrefs?
import requests
from bs4 import BeautifulSoup
url = 'http://sports.williamhill.com/bet/pt/betlive/9'
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
jogos = soup.find_all('tr', class_='rowLive')
jogo = jogos[0]
linksgame = jogo.find('a', href=True).attrs['href'].strip()
print(linksgame)

jogos is a list, so you can loop over it and call find() for an a tag on every iteration:
import requests
from bs4 import BeautifulSoup
url = "http://sports.williamhill.com/bet/pt/betlive/9"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, "html.parser")
jogos = soup.find_all("tr", class_="rowLive")
for tag in jogos:
    print(tag.find("a", href=True)["href"])
Or:
print([tag.find("a", href=True)["href"] for tag in jogos])
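If a row happens to contain no a tag, find() returns None and the ["href"] lookup raises a TypeError. A minimal defensive sketch, reusing jogos from above (the None guard is an addition, not part of the original answer):
links = []
for tag in jogos:
    a = tag.find("a", href=True)  # None when the row has no link
    if a is not None:
        links.append(a["href"].strip())
print(links)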

Related

Python Request returning different result than original page (browser)

I am trying to build a simple web scraper to monitor Nike's site here in Brazil.
Basically I want to track products that are in stock right now, to check when new products are added.
My problem is that when I navigate to https://www.nike.com.br/snkrs#estoque I see different products compared to what I get with Python's requests method.
Here is the code I am using:
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
url = 'https://www.nike.com.br/snkrs#estoque'
page = requests.get(url, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
len(soup.find_all(class_='produto produto--comprar'))
This code gives me 40, but in the browser I can see 56 products: https://prnt.sc/26jeo1i
The data comes from a different source, spread across 3 pages:
import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
}
productList = []
for p in [1, 2, 3]:
    url = f'https://www.nike.com.br/Snkrs/Estoque?p={p}&demanda=true'
    page = requests.get(url, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    productList += soup.find_all(class_='produto produto--comprar')
Output:
print(len(productList))
56
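If the number of pages changes over time, hardcoding [1, 2, 3] will silently miss products. A hedged variant that keeps fetching pages until one comes back empty, assuming the ?p= parameter keeps paginating the same way:
import requests
from bs4 import BeautifulSoup

headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'}
productList = []
p = 1
while True:
    url = f'https://www.nike.com.br/Snkrs/Estoque?p={p}&demanda=true'
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    products = soup.find_all(class_='produto produto--comprar')
    if not products:  # an empty page means we are past the last one
        break
    productList += products
    p += 1
print(len(productList))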

I cannot scrape items from this website (Python)

I am trying to scrape all the clothing items on this website but I have not been able to do it. I set limit=3 in find_all but it gives me only 1 result. How can I get all the results in one request?
Please help me, I am stuck on this!
This is the e-commerce site I am trying to scrape:
import requests
from bs4 import BeautifulSoup

def trendyol():
    url = "https://www.trendyol.com/erkek+kazak--hirka?filtreler=22|175"
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
    page = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page, "html.parser")
    cards = soup.find_all("div", {"class": "p-card-chldrn-cntnr"}, limit=3)
    for div in cards:
        link = str("https://www.trendyol.com/" + div.a.get("href"))
        name = div.find("span", {"class": "prdct-desc-cntnr-name hasRatings"}).text
        print(f'link: {link}')
        print(f'isim: {name}')
Try this code:
from bs4 import BeautifulSoup
import requests

def trendyol(url):
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
    page = requests.get(url, headers=headers).text
    soup = BeautifulSoup(page, "html.parser")
    container = soup.find("div", {'class': 'prdct-cntnr-wrppr'})
    for link in container.find_all('div', {'class': 'p-card-chldrn-cntnr'}):
        print("https://www.trendyol.com" + link.find('a', href=True)['href'])
        print(link.find('div', {'class': 'image-container'}).img['alt'])
        print(link.find('span', {'class': 'prdct-desc-cntnr-ttl'}).text)

url = "https://www.trendyol.com/erkek+kazak--hirka?filtreler=22%7C175&pi=3"
trendyol(url)
This code will print the product URL, the title, and the alt text of the product image. Thanks.
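To cover every listing page instead of just pi=3, one could loop over the pi query parameter until a page yields no product cards. A sketch, under the assumption that the pi parameter and the class names above stay valid:
import requests
from bs4 import BeautifulSoup

def trendyol_all(base_url):
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36'}
    pi = 1
    while True:
        page = requests.get(f"{base_url}&pi={pi}", headers=headers).text
        soup = BeautifulSoup(page, "html.parser")
        cards = soup.find_all('div', {'class': 'p-card-chldrn-cntnr'})
        if not cards:  # no product cards: we are past the last page
            break
        for card in cards:
            print("https://www.trendyol.com" + card.find('a', href=True)['href'])
        pi += 1

trendyol_all("https://www.trendyol.com/erkek+kazak--hirka?filtreler=22%7C175")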

Python web scraping, web parser

I just started learning Python, and I have a problem with scraping.
The code runs without errors, but when I scrape I only get an empty list [].
What am I doing wrong?
I couldn't find the same problem elsewhere. Thanks for your time!
import requests
from bs4 import BeautifulSoup as bs4

headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
url = "https://www.worldometers.info/geography/alphabetical-list-of-countries/"
session = requests.session()
try:
    req = session.get(url, headers=headers)
    if req.status_code == 200:
        soup = bs4(req.content, "html.parser")
        divs = soup.find_all("div", attrs={"style": "font-weight"})
        name = soup.find_all()
        print(divs)
except Exception:
    print("ERROR IN URL ADDRESS")
You can get the table with class table-condensed and extract the data you need. Please check the code below:
import requests
from bs4 import BeautifulSoup

headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
url = "https://www.worldometers.info/geography/alphabetical-list-of-countries/"
session = requests.session()
try:
    req = session.get(url, headers=headers)
    if req.status_code == 200:
        soup = BeautifulSoup(req.content, "html.parser")
        countries = soup.find("table", {"class": "table-condensed"}).find("tbody").findAll("tr")
        for country in countries:
            print(country.findAll("td")[1].text)
except Exception:
    print("ERROR IN URL ADDRESS")
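As an aside, for a plain HTML table like this one, pandas can pull the rows out in one call. A minimal sketch, assuming pandas and lxml are installed and reusing the req response fetched above; treating the first table as the country list is an assumption:
import pandas as pd

tables = pd.read_html(req.text)       # parses every <table> on the page
countries = tables[0]                 # the alphabetical list is assumed to be the first table
print(countries.iloc[:, 1].tolist())  # second column holds the country names, as in the loop above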

It returns None when I get the id of the URL using Beautiful Soup; how can I get the content of its id?

import requests
import json
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Ozeri-Digital-Multifunction-Kitchen-Elegant/dp/B01LAVADW2?pf_rd_p=3e7c8265-9bb7-5ab2-be71-1af95f06a1ad&pf_rd_r=52Z7DNQGKGV31B114R1K&pd_rd_wg=IAKey&ref_=pd_gw_ri&pd_rd_w=rDONb&pd_rd_r=b6b3cf66-c4a8-449a-8676-9027e8922b96'
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
title = soup.find(id="productTitle")
print(title)
You created a headers variable, but you didn't pass it to your request. Also, you are not checking the response status code (which is 503 here).
With those fixes, your code should look something like this:
import requests
from bs4 import BeautifulSoup

URL = 'https://www.amazon.com/Ozeri-Digital-Multifunction-Kitchen-Elegant/dp/B01LAVADW2?pf_rd_p=3e7c8265-9bb7-5ab2-be71-1af95f06a1ad&pf_rd_r=52Z7DNQGKGV31B114R1K&pd_rd_wg=IAKey&ref_=pd_gw_ri&pd_rd_w=rDONb&pd_rd_r=b6b3cf66-c4a8-449a-8676-9027e8922b96'
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}
r = requests.get(URL, headers=headers)
if r.status_code == 200:
    soup = BeautifulSoup(r.text, 'html.parser')
    title = soup.find(id="productTitle")
    print(title.next)
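Note that the text inside #productTitle is padded with whitespace, and Amazon sometimes answers with a bot-check page in which the element is missing entirely. A guarded sketch reusing soup from above (get_text(strip=True) replaces the .next access; the None check is an addition):
title = soup.find(id="productTitle")
if title is not None:
    print(title.get_text(strip=True))
else:
    print("productTitle not found - possibly a captcha/bot-check response")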

When I use urllib2 to crawl a website, the result has no tags such as html, body

import urllib2
url = 'http://www.bilibili.com/video/av1669338'
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
headers={"User-Agent":user_agent}
request=urllib2.Request(url,headers=headers)
response=urllib2.urlopen(request)
text = response.read()
text[:100]
'\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03\xcd}ys\x1bG\xb2\xe7\xdfV\xc4|\x87\x1exhRk\x81\xb8\x08\x10\x90E\xfa\x89\xb2f\x9f\xe3\xd9\xcf\x9e\x1dyb7\xec\tD\x03h\x90\x90p\t\x07)yf"D\xf9I&EI\xd4}\x91\xb6.\xeb\xb0e\x93\x94%Y\xbc$E\xccW\x194\x00\xfe\xe5\xaf\xf0~Y\xd5\xd5\xa8\xeeF\x83\xa7'
import requests
from bs4 import BeautifulSoup
import requests
from bs4 import BeautifulSoup

def data():
    url = 'http://www.bilibili.com/video/av1669338'
    user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36"
    headers = {"User-Agent": user_agent}
    response = requests.get(url, headers=headers)
    data = response.content
    _html = BeautifulSoup(data, "html.parser")
    _meta = _html.head.select('meta[name=keywords]')
    print _meta[0]['content']
Try this:
import bs4, requests
res = requests.get("http://www.bilibili.com/video/av1669338")
soup = bs4.BeautifulSoup(res.content, "lxml")
result = soup.find("meta", attrs = {"name":"keywords"}).get("content")
print result
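The root cause in the original urllib2 snippet is that the server answered with gzip-compressed bytes: the \x1f\x8b prefix in the output is the gzip magic number. requests decompresses this automatically; urllib2 does not. A sketch of decompressing by hand while staying with urllib2 (Python 2, like the original code):
import gzip
import urllib2
from StringIO import StringIO

request = urllib2.Request('http://www.bilibili.com/video/av1669338',
                          headers={"User-Agent": "Mozilla/5.0"})
response = urllib2.urlopen(request)
raw = response.read()
if response.info().get('Content-Encoding') == 'gzip':
    raw = gzip.GzipFile(fileobj=StringIO(raw)).read()  # decompress manually
print raw[:100]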
