I'm trying to scrap some links from a site but I'm running into an issue where my for loop will stop at the first link.
Currently What I have
import requests
import lxml
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
fighter_links = soup.find('td', {
'class': 'b-statistics__table-col'
}).find_all('a')
fighterLinks = []
for anchor in fighter_links:
# urls = anchor['href']
fighterLinks.append(anchor['href'])
print(fighterLinks)
When I print I'm getting
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9']
Site I'm trying to pull from
when you do
fighter_links = soup.find('td', {'class': 'b-statistics__table-col'}).find_all('a')
you are only getting the first table record. soup.find will only return the first match that it finds. what you need to do is change it to
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})
fighterLinks = []
that will get you all the table enteries that match your class name, and from there you need to do loop to extract out the links
for link in fighter_links:
if(link.find('a')):
fighterLinks.append(link.find('a').get('href'))
I don't know if this will help, but I hope it does:
import requests
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
aa = soup.select("a.b-link_style_black")
fighterLinks = []
for i in aa:
for k in i:
fighterLinks.append(aa[aa.index(i)].attrs["href"])
print(fighterLinks)
outputs:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a']
Requests will fail on some connections in this instance. Better use cloudscraper: (pip install cloudscraper)
import cloudscraper
from bs4 import BeautifulSoup
scraper = cloudscraper.create_scraper()
soup = BeautifulSoup(scraper.get("http://ufcstats.com/statistics/fighters?char=a").text)
links = soup.select_one('.b-statistics__table').select('a')
print(set([x.get('href') for x in links]))
This returns:
{'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/1c5879330d42255f'}
Hey guess so I got as far as being able to add the a class to a list. The problem is I just want the href link to be added to the links_with_text list and not the entire a class. What am I doing wrong?
from bs4 import BeautifulSoup
from requests import get
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id = 'hnmain')
articles = results.find_all(class_="title")
links_with_text = []
for article in articles:
link = article.find('a', href=True)
links_with_text.append(link)
print('\n'.join(map(str, links_with_text)))
This prints exactly how I want the list to print but I just want the href from every a class not the entire a class. Thank you
To get all links from the https://news.ycombinator.com, you can use CSS selector 'a.storylink'.
For example:
from bs4 import BeautifulSoup
from requests import get
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
links_with_text = []
for a in soup.select('a.storylink'): # <-- find all <a> with class="storylink"
links_with_text.append(a['href']) # <-- note the ['href']
print(*links_with_text, sep='\n')
Prints:
https://blog.mozilla.org/futurereleases/2020/06/18/introducing-firefox-private-network-vpns-official-product-the-mozilla-vpn/
https://mxb.dev/blog/the-return-of-the-90s-web/
https://github.blog/2020-06-18-introducing-github-super-linter-one-linter-to-rule-them-all/
https://www.sciencemag.org/news/2018/11/why-536-was-worst-year-be-alive
https://www.strongtowns.org/journal/2020/6/16/do-the-math-small-projects
https://devblogs.nvidia.com/announcing-cuda-on-windows-subsystem-for-linux-2/
https://lwn.net/SubscriberLink/822568/61d29096a4012e06/
https://imil.net/blog/posts/2020/fakecracker-netbsd-as-a-function-based-microvm/
https://jepsen.io/consistency
https://tumblr.beesbuzz.biz/post/621010836277837824/advice-to-young-web-developers
https://archive.org/search.php?query=subject%3A%22The+Navy+Electricity+and+Electronics+Training+Series%22&sort=publicdate
https://googleprojectzero.blogspot.com/2020/06/ff-sandbox-escape-cve-2020-12388.html?m=1
https://apnews.com/1da061ce00eb531291b143ace0eed1c9
https://support.apple.com/library/content/dam/edam/applecare/images/en_US/appleid/android-apple-music-account-payment-none.jpg
https://standpointmag.co.uk/issues/may-june-2020/the-healing-power-of-birdsong/
https://steveblank.com/2020/06/18/the-coming-chip-wars-of-the-21st-century/
https://www.videolan.org/security/sb-vlc3011.html
https://onesignal.com/careers/2023b71d-2f44-4934-a33c-647855816903
https://www.bbc.com/news/world-europe-53006790
https://github.com/efficient/HOPE
https://everytwoyears.org/
https://www.historytoday.com/archive/natural-histories/intelligence-earthworms
https://cr.yp.to/2005-590/powerpc-cwg.pdf
https://quantum.country/
http://www.crystallography.net/cod/
https://parkinsonsnewstoday.com/2020/06/17/tiny-magnetically-powered-implant-may-be-future-of-deep-brain-stimulation/
https://spark.apache.org/releases/spark-release-3-0-0.html
https://arxiv.org/abs/1712.09624
https://www.washingtonpost.com/technology/2020/06/18/data-privacy-law-sherrod-brown/
https://blog.chromium.org/2020/06/improving-chromiums-browser.html
https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king
I want to get TOMATOMETER and AUDIENCE SCORE from that website,
but got an empty list.
soup = BeautifulSoup(html, 'html.parser')
notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
You can use last-child selector for span type with the parent class. This is using BeautifulSoup 4.7.1
import requests
from bs4 import BeautifulSoup
res = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king')
soup = bs(res.content, 'lxml')
ratings = [item.text.strip() for item in soup.select('h1.mop-ratings-wrap__score span:last-child')]
print(ratings)
Your code works well
>>> from bs4 import BeautifulSoup
>>> html = requests.get('https://www.rottentomatoes.com/m/the_lord_of_the_rings_the_return_of_the_king').text
>>> soup = BeautifulSoup(html, 'html.parser')
>>> notices = soup.select('#tomato_meter_link > span.mop-ratings-wrap__percentage')
>>> notices
[<span class="mop-ratings-wrap__percentage">93%</span>]
How did you get html variable?
My example
from bs4 import BeautifulSoup
import requests
result = requests.get("https://pythonprogramming.net/parsememcparseface/")
c = result.content
soup = BeautifulSoup(c,'lxml')
patch_name = soup.find_all(["a", "p"])
u = soup.get_text()
print(u)
How do I get the text I need for I can store it in a variable for later usage.
this will return a list of a and p tag:
patch_name = soup.find_all(["a", "p"])
you can get all the text of the list :
[tag.get_text() for tag in patch_name]
My problem is when parsing a website and then loading the data tree with BS. How can I look for the content of an <em> Tag? I tried
for first in soup.find_all("li", class_="li-in"):
print first.select("em.fl.in-date").string
#or
print first.select("em.fl.in-date").contents
but it doesnt work. Pls help.
I am searching for cars on tutti.ch
Here is my entire code:
#Crawl tutti.ch
import urllib
thisurl = "http://www.tutti.ch/stgallen/fahrzeuge/autos"
handle = urllib.urlopen(thisurl)
html_gunk = handle.read()
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_gunk, 'html.parser')
for first in soup.find_all("li", class_="li-in"):
if first.a.string and "Audi" and "BMW" in first.a.string:
print "Geschafft: %s" % first.a.contents
print first.select("em.fl.in-date").string
else:
print first.a.contents
When it finds a bmw or audi it should check for when the car was inserted. The time is located in an em-Tag like this:
<em class="fl in-date">
Heute
<br></br>
13:59
</em>
first.select("em.fl.in-date").text
Assuming your selector is correct. You didn't provide which URL you're scraping, so I can't be sure.
>>> url = "http://stackoverflow.com/questions/38187213/python-beautifulsoup"
>>> from bs4 import BeautifulSoup
>>> import urllib2
>>> html = urllib2.urlopen(url).read()
>>> soup = BeautifulSoup(html)
>>> soup.find_all("p")[0].text
u'My problem is when parsing a website and then loading the data tree with BS. How can I look for the content of an <em> Tag? I tried '
After seeing your code, I made the following change, take a look:
#Crawl tutti.ch
import urllib
thisurl = "http://www.tutti.ch/stgallen/fahrzeuge/autos"
handle = urllib.urlopen(thisurl)
html_gunk = handle.read()
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_gunk, 'html.parser')
for first in soup.find_all("li", class_="li-in"):
if first.a.string and "Audi" and "BMW" in first.a.string:
print "Geschafft: %s" % first.a.contents
print first.select("em.fl.in-date")[0].text
else:
print first.a.contents