BeautifulSoup returns empty list - python

I am new to scraping with Python. I am using BeautifulSoup to extract quotes from a website, and here's my code:
#!/usr/bin/python3
from urllib.request import urlopen
from bs4 import BeautifulSoup
r = urlopen("http://quotes.toscrape.com/tag/inspirational/")
bsObj = BeautifulSoup(r, "lxml")
links = bsObj.find_all("div", {"class:" "quote"})
print(links)
It returns:
[]
But when I try this:
for link in links:
    print(link)
It returns nothing.
(Note: this happened to me on every website.)
Edit: the purpose of the code above is just to return a Tag, not the text of the quote.
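For reference, the likely culprit is the attrs dict in the snippet above: it is missing its colon separator, so Python reads {"class:" "quote"} as one concatenated string, "class:quote", rather than a mapping from class to quote, and nothing matches. Assuming the target class is quote, a corrected call would be:
links = bsObj.find_all("div", {"class": "quote"})
# or, equivalently, using bs4's keyword form:
links = bsObj.find_all("div", class_="quote")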

Related

How do I get all the links from multiple web pages in python?

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

req = Request("https://www.indiegogo.com/individuals/23489031")
html_page = urlopen(req)
soup = BeautifulSoup(html_page, "lxml")
links = []
for link in soup.findAll('a'):
    links.append(link.get('href'))
print(links)
This code works if I use only one URL, but how do I do the same thing with multiple URLs?
I haven't used bs4 ever, but you may be able to just create a list containing all the URLs you want to check, then use a loop to work over each URL separately. Like:
urls = ["https://","https://","http://"] #But with actual links
for link in urls:
#Work with each link seperatly here
pass
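A concrete version of that idea, reusing the snippet from the question (a sketch, with the URL list standing in for whichever pages you actually want):
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen

urls = ["https://www.indiegogo.com/individuals/23489031"]  # add more URLs here
all_links = []
for url in urls:
    soup = BeautifulSoup(urlopen(Request(url)), "lxml")
    # collect every href on this page before moving on to the next URL
    all_links.extend(a.get('href') for a in soup.find_all('a'))
print(all_links)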
Here's a small piece of code I had to write at some point while scraping; you can adapt it to what you want to achieve. I hope it helps:
import requests
from bs4 import BeautifulSoup as bs

url_list = ['https://www.example1.com', 'https://www.example2.com']

def getlinks(url):
    r = requests.get(url)
    tags_list = bs(r.text, 'html.parser').find_all('a')
    # take each <a> tag's href, or '' when the tag has none
    hrefs = [a.attrs['href'] if 'href' in a.attrs else '' for a in tags_list]
    # prefix relative links with the scheme and domain of the page they came from
    scheme, domain = url.split('//')
    links = [href if href.startswith('https:') else f'{scheme}//{domain}{href}'
             for href in hrefs]
    return links
You can then loop through url_list and call getlinks(url) for each entry.
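Something like:
for url in url_list:
    for link in getlinks(url):
        print(link)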

How to webscrape the text inside a link in Python?

I would like to webscrape the following page: https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html
In particular, I would like to get the text inside every link you see displayed when clicking on the link above. I am only able to do it by clicking on each link. For example, clicking on the first one:
import pandas as pd
from bs4 import BeautifulSoup
import requests
x = "https://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in211222~5f9a709924.en.html"
x1 = [requests.get(x)]
x2 = [BeautifulSoup(x1[0].text)]
x3 = [x2[0].select("p+ p") for i in range(len(x2)-1)]
The problem is that I am not able to automate the process that leads me from the url with the list of links containing text (https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html) to the actual link where the text I need is stored (e.g. https://www.ecb.europa.eu/press/inter/date/2021/html/ecb.in211222~5f9a709924.en.html)
Can anyone help me?
Thanks!
To get a list of all links on https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html:
from bs4 import BeautifulSoup
import requests
r = requests.get('https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html')
soup = BeautifulSoup(r.text, 'html.parser')
links = [link.get('href') for link in soup.find_all('a')]
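Note that the hrefs on that index page are site-relative (the follow-up answer below builds absolute URLs the same way), so to request them you would join each one with the domain; a small sketch using urllib.parse.urljoin:
from urllib.parse import urljoin

# skip None entries from <a> tags without an href
absolute_links = [urljoin('https://www.ecb.europa.eu', link) for link in links if link]
print(absolute_links)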
Wouter's answer is correct for getting all links, but if you need just the title links, you can use a more specific selector like select('div.title > a'). Here's an example:
from bs4 import BeautifulSoup
import requests
url = "https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html"
html = BeautifulSoup(requests.get(url).text, 'html.parser')
links = html.select('div.title > a')
for link in links:
    print(link.attrs['href'])
In particular, I would like to get the text inside every link you see displayed when clicking on the link above.
To get the text of every linked article you have to iterate over your list of links and request each of them:
for link in soup.select('div.title > a'):
    article_url = f"https://www.ecb.europa.eu{link['href']}"
    # parse each article into its own soup so the index soup stays intact
    article_soup = BeautifulSoup(requests.get(article_url).content)
    data.append({
        'title': link.text,
        'url': article_url,
        'subtitle': article_soup.main.h2.text,
        'text': ' '.join(p.text for p in article_soup.select('main .section p:not([class])'))
    })
Example
Contents are stored in a list of dicts, so you can easily access and process the data later.
from bs4 import BeautifulSoup
import requests

url = "https://www.ecb.europa.eu/press/inter/date/2021/html/index_include.en.html"
soup = BeautifulSoup(requests.get(url).content)
data = []

for link in soup.select('div.title > a'):
    article_url = f"https://www.ecb.europa.eu{link['href']}"
    # parse each article into its own soup so the index soup stays intact
    article_soup = BeautifulSoup(requests.get(article_url).content)
    data.append({
        'title': link.text,
        'url': article_url,  # the article's own URL, not the index page
        'subtitle': article_soup.main.h2.text,
        'text': ' '.join(p.text for p in article_soup.select('main .section p:not([class])'))
    })
print(data)
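Since the question already imports pandas, the resulting list of dicts converts straight into a DataFrame for further processing:
import pandas as pd

# each dict becomes one row; keys become columns
df = pd.DataFrame(data)
print(df[['title', 'url']])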

How to scrape main headings of a website using python in colab?

Hi, I am a beginner and would like to get the list of all datasets from the website 'https://www.kaggle.com/datasets', based on the filters 'csv' and 'only datasets with tasks'.
I applied the filters and inspected the element, but my attempt returns an empty list. This is my code:
from urllib.request import urlopen
from bs4 import BeautifulSoup
url = 'https://www.kaggle.com/datasets?sort=usability&fileType=csv&tasks=true'
html = urlopen(url)
soup = BeautifulSoup(html.read(), 'lxml')
titles = soup.find_all('li')
print(titles)
Can anyone help?
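The dataset list on that page is rendered client-side by JavaScript, so urlopen only receives the empty page shell and find_all('li') has nothing to match. One common workaround, sketched here on the assumption that a Chrome driver is available locally, is to let a real browser render the page first, e.g. with Selenium:
from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://www.kaggle.com/datasets?sort=usability&fileType=csv&tasks=true'
driver = webdriver.Chrome()  # assumes a chromedriver install
driver.get(url)
# hand the fully rendered HTML to BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'lxml')
driver.quit()
titles = soup.find_all('li')
print(titles)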

Trying to scrape Aliexpress

So I am trying to scrape the price of a product on AliExpress. I tried inspecting the element, which looks like:
<span class="product-price-value" itemprop="price" data-spm-anchor-id="a2g0o.detail.1000016.i3.fe3c2b54yAsLRn">US $14.43</span>
I'm trying to run the following code
import pandas as pd
from bs4 import BeautifulSoup
from urllib.request import urlopen
import re
url = 'https://www.aliexpress.com/item/32981494236.html?spm=a2g0o.productlist.0.0.44ba26f6M32wxY&algo_pvid=520e41c9-ba26-4aa6-b382-4aa63d014b4b&algo_expid=520e41c9-ba26-4aa6-b382-4aa63d014b4b-22&btsid=0bb0623b16170222520893504e9ae8&ws_ab_test=searchweb0_0,searchweb201602_,searchweb201603_'
source = urlopen(url).read()
soup = BeautifulSoup(source, 'lxml')
soup.find('span', class_='product-price-value')
but I keep getting a blank output. I must be doing something wrong but these methods seem to work in the tutorials I've seen.
Here's what I found. If I understood correctly, the price on the page you gave is filled in by scripts; the original HTML doesn't contain it directly, only the script tags, so I just used split to pull the value out of them. Here is my code:
from bs4 import BeautifulSoup
import requests
url = 'https://aliexpress.ru/item/1005002281350811.html?spm=a2g0o.productlist.0.0.42d53b59T5ddTM&algo_pvid=f3c72fef-c5ab-44b6-902c-d7d362bcf5a5&algo_expid=f3c72fef-c5ab-44b6-902c-d7d362bcf5a5-1&btsid=0b8b035c16170960366785062e33c0&ws_ab_test=searchweb0_0,searchweb201602_,searchweb201603_&sku_id=12000019900010138'
data = requests.get(url)
soup = BeautifulSoup(data.content, features="lxml")
res = soup.findAll("script")
# the price sits inside one of the script tags as a "totalValue:..." field;
# grab the text between "totalValue:" and the next "}" and strip the quoting
total_value = str(res[-3]).split("totalValue:")[1].split("}")[0].replace("\"", "").replace(".", "").strip()
print(total_value)
It works fine; I tried it on a few pages from AliExpress.
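The string splitting is fragile; since the question already imports re, an equivalent extraction with a regular expression (under the same assumption that one of the script tags carries a totalValue: field) could look like:
import re

# capture everything between "totalValue:" and the next closing brace
match = re.search(r'totalValue:(.*?)}', str(res[-3]), re.S)
if match:
    print(match.group(1).replace('"', '').replace('.', '').strip())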

Not getting the entire <li> line using BeautifulSoup

I am using BeautifulSoup to extract the list items under the class "secondary-nav-main-links" from the https://www.champlain.edu/current-students web page. I thought my working code below would extract each entire <li> line, but the last portion, </li>, is placed on its own line. I included screen captures of the current output and the intended output. Any ideas? Thanks!
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('https://www.champlain.edu/current-students')
bs = BeautifulSoup(html.read(), 'html.parser')
soup = bs.find(class_='secondary-nav secondary-nav-sm has-callouts')
for div in soup.find_all('li'):
    print(div)
Current output (screenshot): each <li> prints with its closing </li> on a separate line.
Intended output (screenshot): each <li>...</li> on a single line.
You can remove the newline character with str.replace:
str(div).replace('\n','')
And you can unescape HTML entities with html.unescape; to turn &amp; back into &, add this to the print statement:
import html
html.unescape(str(div))
So your code becomes:
from urllib.request import urlopen
from bs4 import BeautifulSoup
import html

# use a different name for the page handle so it doesn't shadow the html module
page = urlopen('https://www.champlain.edu/current-students')
bs = BeautifulSoup(page.read(), 'html.parser')
soup = bs.find(class_='secondary-nav secondary-nav-sm has-callouts')
for div in soup.find_all('li'):
    print(html.unescape(str(div).replace('\n','')))
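If the list items also contain indentation spaces rather than just newlines, collapsing every run of whitespace may be closer to the one-line output you want; a small variant of the print line:
print(html.unescape(' '.join(str(div).split())))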
