Facebook Scraper - python

I'm trying to scrape posts and images from this Facebook profile: https://www.facebook.com/carlostablanteoficial, but I get nothing when I try to reach the actual post text with this code:
from urllib.request import urlopen
import requests
from bs4 import BeautifulSoup
html = urlopen("https://www.facebook.com/carlostablanteoficial")
res = BeautifulSoup(html.read(),"html5lib");
resdiv = res.div
post = resdiv.findAll('div', class_='text_exposed_root')
print(post)

This will return many results:
import requests
from bs4 import BeautifulSoup
# Download the raw profile page; BeautifulSoup can only find markup that is
# present in this HTML response.
data = requests.get("https://www.facebook.com/carlostablanteoficial")
soup = BeautifulSoup(data.text, 'html.parser')
# Dump every <div> element in the parsed document.
for div in soup.find_all('div'):
    print(div)
To search for a specific class, change the loop to:
# attrs is a dict of attribute name -> required value (note the colon: a comma
# here would build a set instead of a dict).
for div in soup.find_all('div', {'class': 'text_exposed_root'}):
    print(div)
But when I tried it, it returned nothing, meaning there is no div with that class in the HTML that requests receives.

Related

BeautifulSoup find() function returning None because I am not getting correct html info

I am trying to webscrape an Amazon web product page and print out the productTitle. Code below:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import smtplib
def extract():
headers = {my_info}
url = "link"
# this link is to a webpage on Amazon
req = requests.get(url, headers=headers)
soup1 = BeautifulSoup(req.content, "html.parser") # get soup1 from website
soup2 = BeautifulSoup(soup1.prettify(), 'html.parser') # prettify it in soup 2
print(soup2) # when i inspect element on website, it shows that there is a html tag <span> with
id=productTitle
# title = soup2.find('span', id='productTitle').get_text() # find() is returning None
print(soup2.prettify())
I expected the HTML I saw with Inspect Element on the website to be the same as the HTML in soup2, but for some reason it is not, which causes find() to return None. Why is the HTML different? Any help would be appreciated.

Trying to scrape Aliexpress

So I am trying to scrape the price of a product on Aliexpress. I tried inspecting the element which looks like
<span class="product-price-value" itemprop="price" data-spm-anchor-id="a2g0o.detail.1000016.i3.fe3c2b54yAsLRn">US $14.43</span>
I'm trying to run the following code
'''
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
url = 'https://www.aliexpress.com/item/32981494236.html?spm=a2g0o.productlist.0.0.44ba26f6M32wxY&algo_pvid=520e41c9-ba26-4aa6-b382-4aa63d014b4b&algo_expid=520e41c9-ba26-4aa6-b382-4aa63d014b4b-22&btsid=0bb0623b16170222520893504e9ae8&ws_ab_test=searchweb0_0,searchweb201602_,searchweb201603_'
source = urlopen(url).read()
soup = BeautifulSoup(source, 'lxml')
soup.find('span', class_='product-price-value')
'''
but I keep getting a blank output. I must be doing something wrong but these methods seem to work in the tutorials I've seen.
Here is what I found. As far as I understand, the price on the page you linked is filled in by scripts: the original HTML does not contain that element, only script tags, so I used split to extract the value from the script text. Here is my code:
from bs4 import BeautifulSoup
import requests
url = 'https://aliexpress.ru/item/1005002281350811.html?spm=a2g0o.productlist.0.0.42d53b59T5ddTM&algo_pvid=f3c72fef-c5ab-44b6-902c-d7d362bcf5a5&algo_expid=f3c72fef-c5ab-44b6-902c-d7d362bcf5a5-1&btsid=0b8b035c16170960366785062e33c0&ws_ab_test=searchweb0_0,searchweb201602_,searchweb201603_&sku_id=12000019900010138'
# Fetch the product page (url is defined just above) and parse it with lxml.
data = requests.get(url)
soup = BeautifulSoup(data.content, features="lxml")
# Collect every <script> tag in the page.
res = soup.findAll("script")
# Take the third-from-last script, cut out the text between "totalValue:" and
# the next "}", then strip double quotes, dots, and surrounding whitespace.
# NOTE(review): the res[-3] position and the "totalValue:" marker depend on
# the page's current layout; presumably this breaks if the site changes.
# NOTE(review): .replace(".", "") also removes a decimal point if the value
# has one; confirm that is intended.
total_value = str(res[-3]).split("totalValue:")[1].split("}")[0].replace("\"", "").replace(".", "").strip()
print(total_value)
It works fine; I tried it on a few AliExpress pages.

BS4 returns [] instead of the wanted HTML tag

I want to parse the given website and scrape the table. To me the code looks right, but I'm new to Python and web parsing.
import requests
from bs4 import BeautifulSoup
response = requests.get('https://delhifightscorona.in/')
doc = BeautifulSoup(response.text, 'lxml-xml')
cases = doc.find_all('div', {"class": "cell"})
print(cases)
doing this returns
[]
Change your parser and the class and there you have it.
import requests
from bs4 import BeautifulSoup
# Download the page and parse it with the built-in HTML parser.
page = requests.get('https://delhifightscorona.in/')
document = BeautifulSoup(page.text, 'html.parser')
# Locate the first <div> matching the given class string.
soup = document.find('div', {"class": "grid-x grid-padding-x small-up-2"})
# Print the text of the first <h3> inside that <div>.
headline = soup.find("h3")
print(headline.getText())
Output:
423,831
You can choose to print only the cases or the total stats with the date.
import requests
from bs4 import BeautifulSoup
# Fetch and parse the dashboard page.
html = requests.get('https://delhifightscorona.in/').text
doc = BeautifulSoup(html, 'html.parser')
stats = doc.find('div', class_="cell medium-5")
# Whole block: the dates together with the figures.
print(stats.get_text())
# Case count only: the first <h3> inside the block.
print(stats.find('h3').get_text())

python BeautifulSoup4 - why does result print 5 times?

I'm trying to do some web scraping in python using BeautifulSoup 4.
I am trying to scrape the salary of a public employee. I am doing that successfully, but the result is returned 5 times and I cannot figure out why.
Here is the website I am scraping: https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett
Here is my code example:
import requests
from bs4 import BeautifulSoup
source = requests.get(f'https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett')
soup = BeautifulSoup(source.text, 'html.parser')
main_box = soup.find_all('div')
for i in main_box:
try:
x = i.find('div', class_='col-12 col-lg-4 pay')
z = x.find('h2').text
print(z)
except Exception:
pass
And my results are:
$525,000
$525,000
$525,000
$525,000
$525,000
This is the correct salary, but as I said the results print 5 times.
If I go to the page, right-click, and choose 'Inspect', I find the class I am looking for, 'col-12 col-lg-4 pay', and within it the 'h2' tag. There is only one 'h2' tag, and I print its text.
So it seems I am missing something, but what?
I would just get rid of the for loop and use a more specific find query
import requests
from bs4 import BeautifulSoup
# Plain string literal: there is nothing to interpolate, so no f-prefix.
source = requests.get('https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett')
# Fail loudly on an HTTP error instead of parsing an error page.
source.raise_for_status()
soup = BeautifulSoup(source.text, 'html.parser')
# find() returns only the first <div> carrying the "pay" class, so the salary
# is printed once.
main_box = soup.find("div", {"class": "pay"})
print(main_box.find('h2').text)
You can also extract this by using a CSS selector:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
url = 'https://data.richmond.com/salaries/2018/state/university-of-virginia/tony-bennett'
res = requests.get(url).text
soup = BeautifulSoup(res, 'html.parser')
# An id selector targets a single element, so select_one() returns that tag
# directly instead of a one-item list that has to be indexed.
value = soup.select_one('#paytotal')
print(value.text)

Python Beautiful Soup Parse First Href in Each Div

Given the following code:
# import the module
import bs4 as bs
import urllib.request
import re
masterURL = 'http://www.metrolyrics.com/top100.html'
sauce = urllib.request.urlopen(masterURL).read()
soup = bs.BeautifulSoup(sauce,'lxml')
for div in soup.findAll('ul', {'class': 'song-list'}):
for span in div:
for link in span:
for a in link:
print(a)
I can parse multiple divs, and I get a result as follows:
My question is: instead of getting the full contents of the div, how can I return only the highlighted portion, the URL in the href?
Try this. You need to specify the right class to fetch the urls connected to it.
from bs4 import BeautifulSoup
import urllib.request
masterURL = 'http://www.metrolyrics.com/top100.html'
# Close the HTTP response deterministically once the body has been read.
with urllib.request.urlopen(masterURL) as response:
    sauce = response.read()
soup = BeautifulSoup(sauce, 'lxml')
# Print the href of every element carrying the "subtitle" class.
for link in soup.find_all(class_='subtitle'):
    print(link.get("href"))
Output:
http://www.metrolyrics.com/charles-goose-lyrics.html
http://www.metrolyrics.com/param-singh-lyrics.html
http://www.metrolyrics.com/westlife-lyrics.html
http://www.metrolyrics.com/luis-fonsi-lyrics.html
http://www.metrolyrics.com/grease-lyrics.html
http://www.metrolyrics.com/shanti-dope-lyrics.html
and so on ---
# Only tags that actually carry an href attribute are printed.
if 'href' in a.attrs:
    print(a.attrs['href'])
this will give you what you need.

Categories

Resources