Terminal won't show print response using BeautifulSoup - python

Here is my code:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ1.htm')
soup = BeautifulSoup(page.text, 'html.parser')
name_list = soup.find(class_='BodyText')
name_list_item = name_list.find_all('a')
for i in name_list_item:
    names = name_list.contents[0]
    print(names)
Then I ran it, but nothing showed up in the terminal except blank lines.
Please help!! :<

The problem is in the for loop: you have to extract the content from i, not from name_list.
Your working code should look like this:
import requests
from bs4 import BeautifulSoup
page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ1.htm')
soup = BeautifulSoup(page.text, 'html.parser')
name_list = soup.find(class_='BodyText')
name_list_item = name_list.find_all('a')
for i in name_list_item:
    names = i.contents[0]
    print(names)
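As a side note, .contents[0] can come back as another tag if there is nested markup inside the anchor; get_text() is usually the safer way to pull out just the visible name. A minimal sketch of the same loop:
import requests
from bs4 import BeautifulSoup

page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ1.htm')
soup = BeautifulSoup(page.text, 'html.parser')

name_list = soup.find(class_='BodyText')
for a in name_list.find_all('a'):
    # get_text() flattens any nested markup inside the tag;
    # a.get('href') would give the link target instead of the name
    print(a.get_text())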

I would suggest the approach below to get the links.
(The problem with your approach is that it also includes invalid data that we don't want; you can print it and check.) There are 32 items of type <class 'bs4.element.NavigableString'>, which have no meaningful contents, so it ends up printing 32 LF (ASCII value 10) characters.
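If you want to see that whitespace for yourself, this quick diagnostic sketch prints the type of each direct child of the BodyText node, which makes the bare NavigableString entries obvious:
import requests
from bs4 import BeautifulSoup

page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ1.htm')
soup = BeautifulSoup(page.text, 'html.parser')

name_list = soup.find(class_='BodyText')
# NavigableString children are the bare whitespace/newlines between the tags
for child in name_list.contents[:10]:
    print(type(child).__name__, repr(str(child))[:60])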
Useful links »
How to find tags with only certain attributes - BeautifulSoup
How to find children of nodes using Beautiful Soup
Python: BeautifulSoup extract text from anchor tag
>>> import requests
>>> from bs4 import BeautifulSoup
>>>
>>> page = requests.get('https://web.archive.org/web/20121007172955/https://www.nga.gov/collection/anZ1.htm')
>>>
>>> soup = BeautifulSoup(page.text, 'html.parser')
>>> name_list = soup.findAll("tr", {"valign": "top"})
>>>
>>> for name in name_list:
...     print(name.find("a")["href"])
...
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=11630
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=34202
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3475
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=25135
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=2298
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=23988
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=8232
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=34154
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=4910
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3450
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=1986
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3451
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=20099
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3452
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=34309
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=27191
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=5846
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3941
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3941
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3453
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=35173
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=11133
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3455
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3454
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=961
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=11597
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=11597
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=11631
/web/20121007172955/https://www.nga.gov/cgi-bin/tsearch?artistid=3427
>>>
Thank you.

Related

How to not print empty line?

I'm trying to scrape some links from a site, but I'm running into an issue where my for loop stops at the first link.
Currently what I have:
import requests
import lxml
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
fighter_links = soup.find('td', {
    'class': 'b-statistics__table-col'
}).find_all('a')
fighterLinks = []
for anchor in fighter_links:
    # urls = anchor['href']
    fighterLinks.append(anchor['href'])
print(fighterLinks)
When I print, I'm getting:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9']
Site I'm trying to pull from
When you do:
fighter_links = soup.find('td', {'class': 'b-statistics__table-col'}).find_all('a')
you are only getting the first table cell, because soup.find returns only the first match it finds. What you need to do is change it to:
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})
fighterLinks = []
That will get you all the table entries that match your class name, and from there you need to loop over them to extract the links:
for link in fighter_links:
    if link.find('a'):
        fighterLinks.append(link.find('a').get('href'))
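Putting those two pieces together, a runnable version of this answer could look like the following (same selectors as above, just assembled into one script):
import requests
from bs4 import BeautifulSoup

url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')

# find_all returns every matching <td>, not just the first one
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})

fighterLinks = []
for link in fighter_links:
    if link.find('a'):  # some cells contain no link at all
        fighterLinks.append(link.find('a').get('href'))

print(fighterLinks)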
I don't know if this will help, but I hope it does:
import requests
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
aa = soup.select("a.b-link_style_black")
fighterLinks = []
for i in aa:
    for k in i:
        fighterLinks.append(aa[aa.index(i)].attrs["href"])
print(fighterLinks)
outputs:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 
'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a']
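For what it's worth, the repeated ids above appear to come from each row containing several anchors (first name, last name, nickname columns) that all point at the same details page, so the same href gets appended more than once; the inner loop over i's children is also redundant. A flatter sketch of the same idea that removes the duplicates while keeping the original order:
import requests
from bs4 import BeautifulSoup

url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')

hrefs = [a.attrs["href"] for a in soup.select("a.b-link_style_black")]
# dict.fromkeys drops the duplicates but keeps the first-seen order
fighterLinks = list(dict.fromkeys(hrefs))
print(fighterLinks)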
Requests will fail on some connections in this instance. Better to use cloudscraper (pip install cloudscraper):
import cloudscraper
from bs4 import BeautifulSoup
scraper = cloudscraper.create_scraper()
soup = BeautifulSoup(scraper.get("http://ufcstats.com/statistics/fighters?char=a").text)
links = soup.select_one('.b-statistics__table').select('a')
print(set([x.get('href') for x in links]))
This returns:
{'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/1c5879330d42255f'}

How to get just links of articles in list using BeautifulSoup

Hey guys, so I got as far as being able to add the a elements to a list. The problem is I just want the href link to be added to the links_with_text list, and not the entire a element. What am I doing wrong?
from bs4 import BeautifulSoup
from requests import get
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id = 'hnmain')
articles = results.find_all(class_="title")
links_with_text = []
for article in articles:
    link = article.find('a', href=True)
    links_with_text.append(link)

print('\n'.join(map(str, links_with_text)))
This prints exactly how I want the list to print, but I just want the href from every a element, not the entire element. Thank you
To get all links from https://news.ycombinator.com, you can use the CSS selector 'a.storylink'.
For example:
from bs4 import BeautifulSoup
from requests import get
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
links_with_text = []
for a in soup.select('a.storylink'):   # <-- find all <a> with class="storylink"
    links_with_text.append(a['href'])  # <-- note the ['href']

print(*links_with_text, sep='\n')
Prints:
https://blog.mozilla.org/futurereleases/2020/06/18/introducing-firefox-private-network-vpns-official-product-the-mozilla-vpn/
https://mxb.dev/blog/the-return-of-the-90s-web/
https://github.blog/2020-06-18-introducing-github-super-linter-one-linter-to-rule-them-all/
https://www.sciencemag.org/news/2018/11/why-536-was-worst-year-be-alive
https://www.strongtowns.org/journal/2020/6/16/do-the-math-small-projects
https://devblogs.nvidia.com/announcing-cuda-on-windows-subsystem-for-linux-2/
https://lwn.net/SubscriberLink/822568/61d29096a4012e06/
https://imil.net/blog/posts/2020/fakecracker-netbsd-as-a-function-based-microvm/
https://jepsen.io/consistency
https://tumblr.beesbuzz.biz/post/621010836277837824/advice-to-young-web-developers
https://archive.org/search.php?query=subject%3A%22The+Navy+Electricity+and+Electronics+Training+Series%22&sort=publicdate
https://googleprojectzero.blogspot.com/2020/06/ff-sandbox-escape-cve-2020-12388.html?m=1
https://apnews.com/1da061ce00eb531291b143ace0eed1c9
https://support.apple.com/library/content/dam/edam/applecare/images/en_US/appleid/android-apple-music-account-payment-none.jpg
https://standpointmag.co.uk/issues/may-june-2020/the-healing-power-of-birdsong/
https://steveblank.com/2020/06/18/the-coming-chip-wars-of-the-21st-century/
https://www.videolan.org/security/sb-vlc3011.html
https://onesignal.com/careers/2023b71d-2f44-4934-a33c-647855816903
https://www.bbc.com/news/world-europe-53006790
https://github.com/efficient/HOPE
https://everytwoyears.org/
https://www.historytoday.com/archive/natural-histories/intelligence-earthworms
https://cr.yp.to/2005-590/powerpc-cwg.pdf
https://quantum.country/
http://www.crystallography.net/cod/
https://parkinsonsnewstoday.com/2020/06/17/tiny-magnetically-powered-implant-may-be-future-of-deep-brain-stimulation/
https://spark.apache.org/releases/spark-release-3-0-0.html
https://arxiv.org/abs/1712.09624
https://www.washingtonpost.com/technology/2020/06/18/data-privacy-law-sherrod-brown/
https://blog.chromium.org/2020/06/improving-chromiums-browser.html
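Since the asker called the list links_with_text, it may be worth keeping the visible title next to the URL; a small variation of the same loop (a sketch, same 'a.storylink' selector) collects both. Note that Hacker News has reworked its markup since this was written, so the selector may need adjusting on the current site:
from bs4 import BeautifulSoup
import requests

URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')

# (title text, href) pairs instead of bare URLs
links_with_text = [(a.get_text(), a['href']) for a in soup.select('a.storylink')]

for text, href in links_with_text:
    print(f"{text} -> {href}")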

Got empty list with Beautiful Soup and Selenium

https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king
I want to get the TOMATOMETER and AUDIENCE SCORE from that website, but I got an empty list.
soup = BeautifulSoup(html, 'html.parser')
notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
You can use a :last-child selector on the span together with the parent class. This is using BeautifulSoup 4.7.1:
import requests
from bs4 import BeautifulSoup
res = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king')
soup = BeautifulSoup(res.content, 'lxml')
ratings = [item.text.strip() for item in soup.select('h1.mop-ratings-wrap__score span:last-child')]
print(ratings)
Your code works well
>>> import requests
>>> from bs4 import BeautifulSoup
>>> html = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king').text
>>> soup = BeautifulSoup(html, 'html.parser')
>>> notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
>>> notices
[<span class="mop-ratings-wrap__percentage">93%</span>]
How did you get the html variable?
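For completeness, once notices is populated as in the session above, the percentage text itself can be pulled out like this (a sketch; if the selector matches nothing, the page markup has probably changed since then):
import requests
from bs4 import BeautifulSoup

html = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king').text
soup = BeautifulSoup(html, 'html.parser')

notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
if notices:
    print(notices[0].get_text(strip=True))  # e.g. '93%' per the session above
else:
    print("selector matched nothing - the markup may have changed")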

After finding all the links and texts using find_all in Beautiful Soup, how do you grab the one that you need?

My example
from bs4 import BeautifulSoup
import requests
result = requests.get("https://pythonprogramming.net/parsememcparseface/")
c = result.content
soup = BeautifulSoup(c,'lxml')
patch_name = soup.find_all(["a", "p"])
u = soup.get_text()
print(u)
How do I get the text I need so I can store it in a variable for later use?
This will return a list of a and p tags:
patch_name = soup.find_all(["a", "p"])
You can get all the text from the list:
[tag.get_text() for tag in patch_name]
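If the goal is to grab one specific entry rather than all of them, the same list can be filtered on its text. This is only a sketch, and the "sitemap" substring is a placeholder for whatever phrase you are actually looking for:
from bs4 import BeautifulSoup
import requests

result = requests.get("https://pythonprogramming.net/parsememcparseface/")
soup = BeautifulSoup(result.content, 'lxml')

patch_name = soup.find_all(["a", "p"])

# keep only the tags whose visible text contains the phrase of interest
wanted = [tag.get_text() for tag in patch_name if "sitemap" in tag.get_text().lower()]
print(wanted[0] if wanted else "no match")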

Python BeautifulSoup can't select specific tag

My problem is with parsing a website and then loading the data tree with BS. How can I look for the content of an <em> tag? I tried:
for first in soup.find_all("li", class_="li-in"):
    print first.select("em.fl.in-date").string
    # or
    print first.select("em.fl.in-date").contents
but it doesn't work. Please help.
I am searching for cars on tutti.ch
Here is my entire code:
#Crawl tutti.ch
import urllib
thisurl = "http://www.tutti.ch/stgallen/fahrzeuge/autos"
handle = urllib.urlopen(thisurl)
html_gunk = handle.read()
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_gunk, 'html.parser')
for first in soup.find_all("li", class_="li-in"):
    if first.a.string and "Audi" and "BMW" in first.a.string:
        print "Geschafft: %s" % first.a.contents
        print first.select("em.fl.in-date").string
    else:
        print first.a.contents
When it finds a BMW or Audi, it should check when the car was inserted. The time is located in an <em> tag like this:
<em class="fl in-date">
Heute
<br></br>
13:59
</em>
first.select("em.fl.in-date").text
Assuming your selector is correct. You didn't provide which URL you're scraping, so I can't be sure.
>>> url = "http://stackoverflow.com/questions/38187213/python-beautifulsoup"
>>> from bs4 import BeautifulSoup
>>> import urllib2
>>> html = urllib2.urlopen(url).read()
>>> soup = BeautifulSoup(html)
>>> soup.find_all("p")[0].text
u'My problem is when parsing a website and then loading the data tree with BS. How can I look for the content of an <em> Tag? I tried '
After seeing your code, I made the following change; take a look:
#Crawl tutti.ch
import urllib
thisurl = "http://www.tutti.ch/stgallen/fahrzeuge/autos"
handle = urllib.urlopen(thisurl)
html_gunk = handle.read()
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_gunk, 'html.parser')
for first in soup.find_all("li", class_="li-in"):
    if first.a.string and "Audi" and "BMW" in first.a.string:
        print "Geschafft: %s" % first.a.contents
        print first.select("em.fl.in-date")[0].text
    else:
        print first.a.contents
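One caveat unrelated to the select fix: the condition if first.a.string and "Audi" and "BMW" in first.a.string only ever tests for "BMW", because the bare string "Audi" is just a truthy constant in that expression. If both brands are meant to match, a helper along these lines (a sketch) is closer to the intent:
def is_wanted_brand(title, brands=("Audi", "BMW")):
    # True if the listing title mentions any of the given brands
    return title is not None and any(brand in title for brand in brands)

print(is_wanted_brand("Audi A4 Avant"))  # True
print(is_wanted_brand("VW Golf"))        # False
In the loop, the check would then become: if is_wanted_brand(first.a.string):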
