python web scraping,web parser

python web scraping,web parser - python

I have just started learning Python, and I have a problem with scraping.
The code runs without errors, but when I scrape the page I get only an empty list [].
What am I doing wrong?
I can't find the same problem anywhere. Thanks for your time!
`import requests
from bs4 import BeautifulSoup as bs4
# Request headers passed to session.get() below.
headers = {
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
url = "https://www.worldometers.info/geography/alphabetical-list-of-countries/"
session = requests.session()
# NOTE(review): the bodies of try/if/except are not indented in this paste.
try:
req = session.get(url, headers=headers)
# Nothing is parsed or printed unless the status code is exactly 200.
if req.status_code == 200:
soup = bs4(req.content, "html.parser")
# Searches for <div> tags whose style attribute is the string "font-weight".
# NOTE(review): presumably no <div> on the page has exactly that style value,
# which would explain the empty list — verify against the page markup.
divs = soup.find_all("div", attrs={"style" : "font-weight"})
# The result stored in name is not used again in this snippet.
name = soup.find_all()
print(divs)
# Any exception raised inside the try body ends up here, not only URL problems.
except Exception:
print("ERORR IN URL ADRESS")`

You can get the table with class table-condensed and find the data you need. Please check the below code:
import requests
from bs4 import BeautifulSoup
# Request headers passed to session.get() below.
headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
}
url = "https://www.worldometers.info/geography/alphabetical-list-of-countries/"

# The context manager closes the session's pooled connections when done.
with requests.Session() as session:
    try:
        # A timeout keeps the script from hanging on an unresponsive server.
        req = session.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx responses into an exception instead of printing nothing.
        req.raise_for_status()
    except requests.RequestException:
        # Only request-level failures are reported as a URL problem; parsing
        # errors are no longer hidden behind a blanket ``except Exception``.
        print("ERORR IN URL ADRESS")
    else:
        soup = BeautifulSoup(req.content, "html.parser")
        # find() returns None when nothing matches, so check each step.
        table = soup.find("table", {"class": "table-condensed"})
        body = table.find("tbody") if table is not None else None
        if body is None:
            print("Country table not found in the page")
        else:
            countries = body.find_all("tr")
            for country in countries:
                cells = country.find_all("td")
                # The second cell is the one printed; skip rows without it.
                if len(cells) > 1:
                    print(cells[1].text)

Related

How to collect all specified href's?

In this test model I can collect the href value for the first ('tr', class_='rowLive'). I've tried to create a loop to collect all the other hrefs, but it always gives IndentationError: expected an indented block, or says I'm trying to use find instead of find_all.
How should I proceed to collect all the hrefs?
import requests
from bs4 import BeautifulSoup
# Page to download.
url = 'http://sports.williamhill.com/bet/pt/betlive/9'
# Request headers passed to requests.get() below.
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
site = requests.get(url, headers=headers)
soup = BeautifulSoup(site.content, 'html.parser')
# All <tr> elements that carry the CSS class "rowLive".
jogos = soup.find_all('tr', class_='rowLive')
# Only the first matching row is inspected; the other rows are not visited.
jogo = jogos[0]
# First <a> with an href attribute inside that row; the href value is
# stripped of surrounding whitespace.
linksgame = jogo.find('a', href=True).attrs['href'].strip()
print(linksgame)

jogos is a list, so you can loop over it and call find() for an <a> tag on every iteration:
import requests
from bs4 import BeautifulSoup
url = "http://sports.williamhill.com/bet/pt/betlive/9"
# Request headers passed to requests.get() below.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}
# A timeout keeps the script from hanging on an unresponsive server.
site = requests.get(url, headers=headers, timeout=30)
soup = BeautifulSoup(site.content, "html.parser")
# find_all() returns a list-like result, so it can be iterated directly.
jogos = soup.find_all("tr", class_="rowLive")
for tag in jogos:
    # find() returns None when a row has no <a href=...>; skip such rows
    # instead of failing with "'NoneType' object is not subscriptable".
    link = tag.find("a", href=True)
    if link is not None:
        print(link["href"])
Or:
# Collect every row's first href into one list; rows without an
# <a href=...> (where find() returns None) are left out.
print([
    link["href"]
    for link in (tag.find("a", href=True) for tag in jogos)
    if link is not None
])

BeautifulSoup is returning empty data from website

I am running this code
import requests
from bs4 import BeautifulSoup
# NOTE(review): this dict is defined but never passed to requests.get()
# below, so it has no effect on the request.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'
}
r = requests.get('https://www.bohus.no/spiseplassen/oppbevaring-1/gradino-vitrine-2')
soup = BeautifulSoup(r.content, 'lxml')
# Prints the text of the first <div class="price"> in the downloaded HTML.
# NOTE(review): presumably the price is filled in after the initial HTML is
# served, which would explain the empty output — confirm against the page.
print(soup.find('div', class_='price').text)
I am trying to get the price of the product on this site: https://www.bohus.no/spiseplassen/oppbevaring-1/gradino-vitrine-2
All I am getting is empty data when running my code. Am I doing something wrong, or does the website do something special to stop me from scraping the price?

As stated in the comments, you can get the product data from the store's API.
Here's how:
import requests
from bs4 import BeautifulSoup
# Headers for the price/stock request; "x-requested-with" is sent with the
# POST below (the page request itself is made without these headers).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/78.0.3904.108 Safari/537.36',
    "x-requested-with": "XMLHttpRequest",
}
product_url = "https://www.bohus.no/spiseplassen/oppbevaring-1/gradino-vitrine-2"

# Step 1: download the product page and read the product id from the
# hidden input named "d-session-product".
page_response = requests.get(product_url, timeout=30)
page_response.raise_for_status()
page_content = page_response.content
soup = BeautifulSoup(page_content, 'lxml')
product_input = soup.find("input", {"name": "d-session-product"})
if product_input is None:
    # find() returns None when nothing matches; stop with a clear message
    # instead of a TypeError on the subscript below.
    raise SystemExit("Could not find the product id on the product page")
product_id = product_input["value"]

# Step 2: post the product id to the price/stock endpoint.
payload = {
    "debug": "off",
    "ajax": "1",
    "product_list": product_id,
    "action": "init",
    "showStockStatusAndShoppingcart": "1",
    "enablePickupAtNearbyStores": "yes",
}
endpoint = "https://www.bohus.no/lite.cgi/module/priceAndStock"
api_response = requests.post(endpoint, data=payload, headers=headers, timeout=30)
# Fail on an HTTP error status rather than trying to decode an error page.
api_response.raise_for_status()
product_data = api_response.json()
print(product_data["price"][0]["salesPriceNormal"])
Output:
8799

Also, you can use requests-HTML library:
import requests_html
from requests_html import HTMLSession
# CSS selector for the price element in the rendered page.
sel = 'div.price-data > div.price'

# Use the session as a context manager so it is closed even when a step fails.
with HTMLSession() as session:
    r = session.get('https://www.bohus.no/spiseplassen/oppbevaring-1/gradino-vitrine-2')
    # Run the page's JavaScript so dynamically inserted content is present.
    r.html.render()
    price = r.html.find(sel, first=True)
    # With first=True, find() gives None when nothing matches the selector.
    print(price.text if price is not None else "Price element not found")

How to get text paragraphs from a website: Error 403 Forbidden

I am trying to do web scraping with the help of requests and BeautifulSoup, but the result I get is empty (null) instead of the desired outcome.
My code is as follows:
def urlscrape(url):
    """Download *url* and return the text of each <p> inside its 'bg-white' element.

    The page is fetched with a plain ``requests.get`` (no extra headers),
    the first element with class ``bg-white`` is located, and the text of
    every ``<p>`` below it is collected. The URL is printed before the list
    is returned.
    """
    markup = requests.get(url).text
    container = BeautifulSoup(markup, 'html').find(class_='bg-white')
    paragraphs = []
    for node in container.find_all('p'):
        paragraphs.append(node.text)
    print(url)
    return paragraphs
The website is: https://www.afghanistan-analysts.org/en/reports/war-and-peace/taleban-prisoners-release-are-the-latest-proposals-legal/
I want all the <p> tags containing paragraphs to be extracted as texts.

you can try this:-
from bs4 import BeautifulSoup
import requests
import pandas as pd
# Browser-like request headers passed to requests.get() below.
headers = {
    "Connection": "keep-alive",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36",
}
url = 'https://www.afghanistan-analysts.org/en/reports/war-and-peace/taleban-prisoners-release-are-the-latest-proposals-legal/'
# A timeout keeps the script from hanging on an unresponsive server.
page = requests.get(url, headers=headers, timeout=30)
# Fail loudly on 403 and other error statuses instead of parsing an error page.
page.raise_for_status()
# Name a concrete parser: the bare 'html' feature lets BeautifulSoup choose
# among whichever HTML parsers are installed, so results can vary by machine.
soup = BeautifulSoup(page.text, 'html.parser')
# Text of every <p> element in the document.
text = [p.text for p in soup.find_all('p')]

Try this ...
url = "https://www.afghanistan-analysts.org/en/reports/war-and-peace/taleban-prisoners-release-are-the-latest-proposals-legal/"
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'
# Browser-like User-Agent passed to requests.get() below.
headers = {'User-Agent': user_agent}
# A timeout keeps the script from hanging on an unresponsive server.
response = requests.get(url, headers=headers, timeout=30)
# Raise on 403 and other error statuses rather than printing an error page.
response.raise_for_status()
html = response.content
# Print the bytes already stored in html (previously assigned but unused).
print(html)

It returns none when I get the id of the url using beatiful soup and how could i get the content of its id

It returns None when I try to get an element by its id from the URL using Beautiful Soup. How can I get the content of the element with that id?
import requests
import json
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Ozeri-Digital-Multifunction-Kitchen-Elegant/dp/B01LAVADW2?pf_rd_p=3e7c8265-9bb7-5ab2-be71-1af95f06a1ad&pf_rd_r=52Z7DNQGKGV31B114R1K&pd_rd_wg=IAKey&ref_=pd_gw_ri&pd_rd_w=rDONb&pd_rd_r=b6b3cf66-c4a8-449a-8676-9027e8922b96'
# NOTE(review): headers is defined here but not passed to requests.get() below.
headers = {"User-Agent":'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}
# The response status code is not checked before parsing.
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# Looks up the element whose id is "productTitle"; find() gives None when
# the parsed document has no such element.
title = soup.find(id="productTitle")
print(title)

You have created a variable headers, but you didn't add it to your request. Also, you are not checking the status code of the response (which is 503).
After fixing your code, it should look something like this:
import requests
import json
from bs4 import BeautifulSoup
URL = 'https://www.amazon.com/Ozeri-Digital-Multifunction-Kitchen-Elegant/dp/B01LAVADW2?pf_rd_p=3e7c8265-9bb7-5ab2-be71-1af95f06a1ad&pf_rd_r=52Z7DNQGKGV31B114R1K&pd_rd_wg=IAKey&ref_=pd_gw_ri&pd_rd_w=rDONb&pd_rd_r=b6b3cf66-c4a8-449a-8676-9027e8922b96'
# Browser-like User-Agent, actually passed to the request this time.
headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'}
# A timeout keeps the script from hanging on an unresponsive server.
r = requests.get(URL, headers=headers, timeout=30)
if r.status_code == 200:
    # Name the parser explicitly; without it bs4 picks one itself and warns.
    soup = BeautifulSoup(r.text, "html.parser")
    title = soup.find(id="productTitle")
    if title is None:
        # find() returns None when the element is missing from the page.
        print("productTitle element not found")
    else:
        # get_text(strip=True) yields the title without surrounding whitespace.
        print(title.get_text(strip=True))
else:
    # Report blocked or failed requests (e.g. a 503) instead of staying silent.
    print(f"Request failed with status {r.status_code}")

Getting 200 response but not logging in using python requests library

I have tried to scrape a Twitter account's followers list. For that, authentication is required, so I used the requests library for authentication. The problem is that when I try to authenticate, I get a 200 response but the authentication is not done. The code is:
import requests
from bs4 import BeautifulSoup
import json
# Form fields sent as the body of the login POST. The username and password
# values are masked with asterisks in this listing.
payload={
"session[username_or_email]":"*****************",
"session[password]":"**********************",
# NOTE(review): the token is a fixed literal — presumably copied from a
# browser session; confirm it is still valid when the script runs.
"authenticity_token":"aa3520020157738bdabb6d60f2e02894c6c85689",
"ui_metrics":'{"rf":{"a67dd0828000993f688a64a8238f647dd8ef987feb0db5979725fc7e304c3989":-250,"a4cd98aa5fd1d026bfded286fc24eb6ac9cf01a65b789ade51b68558cb0f6ae0":-21,"a88c7b5bdeb04ce3cf55df08c0f981f99df760b9348680c735fbff5b60ad054f":51,"a5e59c69fb04ab30f2f8468030c31ca1150f4265e4c2a35dbb1b67b85be6954f":-68},"s":"QdcvZJ9RhjLcVcW2N_pDt5j5AKQJCkqnh9caYV5ykW35tRpQc_RN5s_VefN2uVCONpXf-qZa-fr8VtCAFrtiOf2f6PhloU2GyxLDN38wGppFNWhb4psCr7x-kibioS9PDxWZF1pe3FM-MOz9YtIQrWxbmEAWnRTK3gUn-1nv4kTFDa159YxJoXiYt43g41sRUJWezJI2yJaECnO1ARbkNAPKrMndxRAcq_5qSFpT8CqzEUvBKPMdFMKeUrzeEecqmx632lTV1NlucVIvV9co3Y3Rk7CtURoaiCwsjTED1brU4XAY3VwsTEuNRUYZqirRNZrYQBCHqsMh5FV_UHpO2QAAAWE40pmN"}',
"scribe_log":"",
"redirect_after_login":"",
# NOTE(review): "authenticity_token" appears a second time in this literal
# (same value); in a dict literal the later entry replaces the earlier one.
"authenticity_token":"aa3520020157738bdabb6d60f2e02894c6c85689",
"return_to_ssl":"",
"remember_me":"1",
"lang":"",
"redirect":""
}
# Request headers for the login POST.
headers={
"accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
"accept-encoding":"gzip, deflate, br",
"accept-language":"en-US,en;q=0.9",
"cache-control":"max-age=0",
# NOTE(review): the cookie value is a fixed string — presumably captured
# from a browser session; confirm it matches the current session.
"cookie":'moments_profile_moments_nav_tooltip_self=true; syndication_guest_id=v1%3A150345116906281638; eu_cn=1; kdt=QErLcBT9OjM5gjEznmsRcHlMTK6biDyAw4gfI5ro; _ga=GA1.2.1923324433.1496571570; tfw_exp=0; __utma=43838368.1923324433.1496571570.1516764481.1516764481.1; __utmz=43838368.1516764481.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); remember_checked_on=0; personalization_id="v1_Iq7dc3Mq746/e91mchhhJg=="; guest_id=v1%3A151698504007256847; ads_prefs="HBERAAA="; _twitter_sess=BAh7CSIKZmxhc2hJQzonQWN0aW9uQ29udHJvbGxlcjo6Rmxhc2g6OkZsYXNo%250ASGFzaHsABjoKQHVzZWR7ADoPY3JlYXRlZF9hdGwrCF5%252F0jhhAToMY3NyZl9p%250AZCIlN2ZmZjExM2NkYjUzODEzZDNiNDE4YWI3NGRhZTAxOTc6B2lkIiU3YWFl%250AZjVhNDY1OWJlNzdiN2RiYjEzNjIwYWVjMGMyMQ%253D%253D--d69792331ec3a3b6c9d994a07f2159bfd5697089; ct0=ecc095f3a61b1c77279538584cb6f20e; _gid=GA1.2.253357133.1517076775; _gat=1',
"referer":"https://twitter.com/login",
"upgrade-insecure-requests":"1",
"user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36"
}
# NOTE(review): the name str shadows the built-in str type from here on.
str=payload["ui_metrics"]
# x (the JSON-encoded form of the string) is not used afterwards.
x=json.dumps(str)
# Parse the ui_metrics JSON text and store the parsed object back in payload.
y=json.loads(str)
payload["ui_metrics"]=y
res = requests.post("https://twitter.com/login",data=payload,headers=headers)
# NOTE(review): this GET is a separate module-level requests call made without
# headers and without a shared Session — presumably any cookies set by the
# POST above are not sent here; confirm.
r = requests.get("https://twitter.com/following")
soup = BeautifulSoup(r.text,"html.parser")
# Status of the login POST, then the final URL and markup of the GET.
print(res.status_code)
print(r.url)
print(soup.prettify())
# NOTE(review): the dict is passed as find_all()'s first positional argument
# rather than as attrs/class_ — confirm this matches the intended class filter.
for item in soup.find_all({"class":"u-textInheritColor js-nav"}):
print(item.text)
I am getting 200 response for status code. How to solve this problem?
NOTE: I am not using any APIs. I want to authenticate using requests library.

Try this. It should get you there:
import requests
from bs4 import BeautifulSoup
# One Session is used for every request so cookies set by the login page
# are sent again with the login POST.
with requests.Session() as s:
    # Load the login form first to obtain the authenticity token it embeds.
    r = s.get("https://twitter.com/login", timeout=30)
    soup = BeautifulSoup(r.text, "lxml")
    token_input = soup.select_one("[name='authenticity_token']")
    if token_input is None:
        # select_one() returns None when nothing matches; stop with a clear
        # message instead of a TypeError on the subscript below.
        raise SystemExit("authenticity_token field not found on the login page")
    token = token_input['value']

    # Login form fields. 'authenticity_token' is listed once (the original
    # literal repeated the same key and value twice).
    payload = {
        'session[username_or_email]': 'your_email',
        'session[password]': 'your_password',
        'authenticity_token': token,
        'ui_metrics': '{"rf":{"c6fc1daac14ef08ff96ef7aa26f8642a197bfaad9c65746a6592d55075ef01af":3,"a77e6e7ab2880be27e81075edd6cac9c0b749cc266e1cea17ffc9670a9698252":-1,"ad3dbab6c68043a1127defab5b7d37e45d17f56a6997186b3a08a27544b606e8":252,"ac2624a3b325d64286579b4a61dd242539a755a5a7fa508c44eb1c373257d569":-125},"s":"fTQyo6c8mP7d6L8Og_iS8ulzPObBOzl3Jxa2jRwmtbOBJSk4v8ClmBbF9njbZHRLZx0mTAUPsImZ4OnbZV95f-2gD6-03SZZ8buYdTDkwV-xItDu5lBVCQ_EAiv3F5EuTpVl7F52FTIykWowpNIzowvh_bhCM0_6ReTGj6990294mIKUFM_mPHCyZxkIUAtC3dVeYPXff92alrVFdrncrO8VnJHOlm9gnSwTLcbHvvpvC0rvtwapSbTja-cGxhxBdekFhcoFo8edCBiMB9pip-VoquZ-ddbQEbpuzE7xBhyk759yQyN4NmRFwdIjjedWYtFyOiy_XtGLp6zKvMjF8QAAAWE468LY"}',
        'scribe_log': '',
        'redirect_after_login': '',
        'remember_me': 1,
    }
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'content-type': 'application/x-www-form-urlencoded',
        'origin': 'https://twitter.com',
        'referer': 'https://twitter.com/login',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    }

    # Submit the form through the same session.
    res = s.post("https://twitter.com/sessions", data=payload, headers=headers, timeout=30)
    soup = BeautifulSoup(res.text, "lxml")
    # Print the text of every element with the "tweet-text" class.
    for item in soup.select(".tweet-text"):
        print(item.text)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

python web scraping,web parser - python

Related

How to collect all specified href's?

BeautifulSoup is returning empty data from website

How to get text paragraphs from a website: Error 403 Forbidden

It returns none when I get the id of the url using beatiful soup and how could i get the content of its id

Getting 200 response but not logging in using python requests library

Categories

Resources