How to scrape text from div with empty class value - python

Hi how to scrape text from div without any class? At first I try to scrape all data from div with class 'jobs page' then without class value but it doesn't work.
from bs4 import BeautifulSoup
import requests
a = {}
def antal_pl(name=''):
try:
page_response = requests.get('https://antal.pl/oferty-pracy?s=&sid=&did=Accountancy', timeout=40).text
page_content = BeautifulSoup(page_response, 'lxml')
data = page_content.find_all(class_ = 'jobs_page')
data_in = data.find_all('div', class_ = None)
print(data_in)
except:
''
antal_pl( name='Accontancy')

Try the below approach to get the text out of that webpage as you mentioned above. I've tried to organize your code a little to make it look cleaner.
from bs4 import BeautifulSoup
import requests
URL = "https://antal.pl/oferty-pracy?s=&sid=&did={}"
def antal_pl(name):
res = requests.get(URL.format(name))
soup = BeautifulSoup(res.text, 'lxml')
data = soup.find(class_='header').find_next_sibling().text.strip()
print(data)
if __name__ == '__main__':
antal_pl("Accountancy")
Result:
Znaleziono 47 ofert pracy.

use XPATH
html = etree.HTML(wb_data)
html_data = html.xpath('/html/body/div/ul/li/a')
enter image description here

Related

Python web scraping script does not find element by css selector

I'm trying to get this web scraper to get current electricity price from this website, it's in finnish but it's right under "Hinta nyt". https://sahko.tk/
Here's my code:
import requests
from bs4 import BeautifulSoup
url = "https://sahko.tk/"
element_selector = ""
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
elements = soup.find_all(element_selector)
if len(elements) == 0:
print("No element found with selector '%s'" % element_selector)
else:
element_text = elements[0].text
print(element_text)
I left the element_selector to empty because what ever I tried it just did not work. I'm not even sure if I'm on the right tracks.
The data you see is embedded inside <script> in that page. To parse the current price you can use next example:
import re
import json
import requests
url = "https://sahko.tk/"
data = requests.get(url).text
data = re.search(r"function prices_today\(\)\{var t= (.*?});", data).group(1)
data = json.loads(data)
print("Hinta nyt", data["now"], "snt/kWh")
Prints:
Hinta nyt 33.27 snt/kWh

How to not print empty line?

I'm trying to scrap some links from a site but I'm running into an issue where my for loop will stop at the first link.
Currently What I have
import requests
import lxml
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
fighter_links = soup.find('td', {
'class': 'b-statistics__table-col'
}).find_all('a')
fighterLinks = []
for anchor in fighter_links:
# urls = anchor['href']
fighterLinks.append(anchor['href'])
print(fighterLinks)
When I print I'm getting
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9']
Site I'm trying to pull from
when you do
fighter_links = soup.find('td', {'class': 'b-statistics__table-col'}).find_all('a')
you are only getting the first table record. soup.find will only return the first match that it finds. what you need to do is change it to
fighter_links = soup.find_all('td', {'class': 'b-statistics__table-col'})
fighterLinks = []
that will get you all the table enteries that match your class name, and from there you need to do loop to extract out the links
for link in fighter_links:
if(link.find('a')):
fighterLinks.append(link.find('a').get('href'))
I don't know if this will help, but I hope it does:
import requests
from bs4 import BeautifulSoup
url = 'http://ufcstats.com/statistics/fighters?char=a'
f = requests.get(url)
soup = BeautifulSoup(f.content, 'lxml')
aa = soup.select("a.b-link_style_black")
fighterLinks = []
for i in aa:
for k in i:
fighterLinks.append(aa[aa.index(i)].attrs["href"])
print(fighterLinks)
outputs:
['http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/1c5879330d42255f', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a']
Requests will fail on some connections in this instance. Better use cloudscraper: (pip install cloudscraper)
import cloudscraper
from bs4 import BeautifulSoup
scraper = cloudscraper.create_scraper()
soup = BeautifulSoup(scraper.get("http://ufcstats.com/statistics/fighters?char=a").text)
links = soup.select_one('.b-statistics__table').select('a')
print(set([x.get('href') for x in links]))
This returns:
{'http://ufcstats.com/fighter-details/a08ddd04eaffd81d', 'http://ufcstats.com/fighter-details/15df64c02b6b0fde', 'http://ufcstats.com/fighter-details/a77633a989013265', 'http://ufcstats.com/fighter-details/0e9869d712e81f8f', 'http://ufcstats.com/fighter-details/1338e2c7480bdf9e', 'http://ufcstats.com/fighter-details/3329d692aea4dc28', 'http://ufcstats.com/fighter-details/79cb2a690b9ba5e8', 'http://ufcstats.com/fighter-details/44aa652b181bcf68', 'http://ufcstats.com/fighter-details/c0ed7b208197e8de', 'http://ufcstats.com/fighter-details/b361180739bed4b0', 'http://ufcstats.com/fighter-details/59a9d6dac61c2540', 'http://ufcstats.com/fighter-details/ebc5af72ad5a28cb', 'http://ufcstats.com/fighter-details/501821d7fb7b95c1', 'http://ufcstats.com/fighter-details/989b85f6540c86b1', 'http://ufcstats.com/fighter-details/7279654c7674cd24', 'http://ufcstats.com/fighter-details/aa6e591c2a2cdecd', 'http://ufcstats.com/fighter-details/8f382b3baa954d2a', 'http://ufcstats.com/fighter-details/2f5cbecbbe18bac4', 'http://ufcstats.com/fighter-details/93fe7332d16c6ad9', 'http://ufcstats.com/fighter-details/c9f6385af6df66d7', 'http://ufcstats.com/fighter-details/2620f3eb21c79614', 'http://ufcstats.com/fighter-details/5140122c3eecd307', 'http://ufcstats.com/fighter-details/83b00f7597e5ac83', 'http://ufcstats.com/fighter-details/6cadc0a0ba7dc015', 'http://ufcstats.com/fighter-details/1c5879330d42255f'}

How to get just links of articles in list using BeautifulSoup

Hey guess so I got as far as being able to add the a class to a list. The problem is I just want the href link to be added to the links_with_text list and not the entire a class. What am I doing wrong?
from bs4 import BeautifulSoup
from requests import get
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
results = soup.find(id = 'hnmain')
articles = results.find_all(class_="title")
links_with_text = []
for article in articles:
link = article.find('a', href=True)
links_with_text.append(link)
print('\n'.join(map(str, links_with_text)))
This prints exactly how I want the list to print but I just want the href from every a class not the entire a class. Thank you
To get all links from the https://news.ycombinator.com, you can use CSS selector 'a.storylink'.
For example:
from bs4 import BeautifulSoup
from requests import get
import requests
URL = "https://news.ycombinator.com"
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
links_with_text = []
for a in soup.select('a.storylink'): # <-- find all <a> with class="storylink"
links_with_text.append(a['href']) # <-- note the ['href']
print(*links_with_text, sep='\n')
Prints:
https://blog.mozilla.org/futurereleases/2020/06/18/introducing-firefox-private-network-vpns-official-product-the-mozilla-vpn/
https://mxb.dev/blog/the-return-of-the-90s-web/
https://github.blog/2020-06-18-introducing-github-super-linter-one-linter-to-rule-them-all/
https://www.sciencemag.org/news/2018/11/why-536-was-worst-year-be-alive
https://www.strongtowns.org/journal/2020/6/16/do-the-math-small-projects
https://devblogs.nvidia.com/announcing-cuda-on-windows-subsystem-for-linux-2/
https://lwn.net/SubscriberLink/822568/61d29096a4012e06/
https://imil.net/blog/posts/2020/fakecracker-netbsd-as-a-function-based-microvm/
https://jepsen.io/consistency
https://tumblr.beesbuzz.biz/post/621010836277837824/advice-to-young-web-developers
https://archive.org/search.php?query=subject%3A%22The+Navy+Electricity+and+Electronics+Training+Series%22&sort=publicdate
https://googleprojectzero.blogspot.com/2020/06/ff-sandbox-escape-cve-2020-12388.html?m=1
https://apnews.com/1da061ce00eb531291b143ace0eed1c9
https://support.apple.com/library/content/dam/edam/applecare/images/en_US/appleid/android-apple-music-account-payment-none.jpg
https://standpointmag.co.uk/issues/may-june-2020/the-healing-power-of-birdsong/
https://steveblank.com/2020/06/18/the-coming-chip-wars-of-the-21st-century/
https://www.videolan.org/security/sb-vlc3011.html
https://onesignal.com/careers/2023b71d-2f44-4934-a33c-647855816903
https://www.bbc.com/news/world-europe-53006790
https://github.com/efficient/HOPE
https://everytwoyears.org/
https://www.historytoday.com/archive/natural-histories/intelligence-earthworms
https://cr.yp.to/2005-590/powerpc-cwg.pdf
https://quantum.country/
http://www.crystallography.net/cod/
https://parkinsonsnewstoday.com/2020/06/17/tiny-magnetically-powered-implant-may-be-future-of-deep-brain-stimulation/
https://spark.apache.org/releases/spark-release-3-0-0.html
https://arxiv.org/abs/1712.09624
https://www.washingtonpost.com/technology/2020/06/18/data-privacy-law-sherrod-brown/
https://blog.chromium.org/2020/06/improving-chromiums-browser.html

How do I simultaneously scrape two pages and produce two distinct lists within one nested 'for-loop'?

I'm scraping from two URLs that have the same DOM structure, and so I'm trying to find a way to scrape both of them at the same time.
The only caveat is that the data scraped from both these pages need to end up on distinctly named lists.
To explain with example, here is what I've tried:
import os
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]
ws_list = []
ws48_list = []
categories = [ws_list, ws48_list]
for url in urls:
response = requests.get(url, headers=headers)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
for a in section.find_all('a'):
player_name = a.text
for cat_list in categories:
cat_list.append(player_name)
print(ws48_list)
print(ws_list)
This ends up printing two identical lists when I was shooting for 2 lists unique to its page.
How do I accomplish this? Would it be better practice to code it another way?
Instead of trying to append to already existing lists. Just create new ones. Make a function to do the scrape and pass each url in turn to it.
import os
import requests
from bs4 import BeautifulSoup as bs
urls = ['https://www.basketball-reference.com/leaders/ws_career.html',
'https://www.basketball-reference.com/leaders/ws_per_48_career.html',]
def parse_page(url, headers={}):
response = requests.get(url, headers=headers)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
return [a.text for a in section.find_all('a')]
ws_list, ws48_list = [parse_page(url) for url in urls]
print('ws_list = %r' % ws_list)
print('ws8_list = %r' % ws48_list)
Just add them to the appropriate list and the problem is solved?
for i, url in enumerate(urls):
response = requests.get(url)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
for a in section.find_all('a'):
player_name = a.text
categories[i].append(player_name)
print(ws48_list)
print(ws_list)
You can use a function to define your scraping logic, then just call it for your urls.
import os
import requests
from bs4 import BeautifulSoup as bs
def scrape(url):
response = requests.get(url)
soup = bs(response.content, 'html.parser')
section = soup.find('table', class_='stats_table')
names = []
for a in section.find_all('a'):
player_name = a.text
names.append(player_name)
return names
ws_list = scrape('https://www.basketball-reference.com/leaders/ws_career.html')
ws48_list = scrape('https://www.basketball-reference.com/leaders/ws_per_48_career.html')
print(ws_list)
print(ws48_list)

Could not get link from html content using python

Here is the URL that I'am using:
http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i
In fact on this page, the link that I am looking for appears may be 5 second after loading the page.
I see after 5 second a post request to :
http://www.protect-stream.com/secur.php
with data like so :
k=2AE_a,LHmb6kSC_c,sZNk4eNixIiPo_c,_c,Gw4ERVdriKuHJlciB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WxBkmAtz75kpYcrHzxtYt32hCYSp0WjqOQR9bY_a,ofQtw_b,
I didn't get from where the 'k' value come from ?
Is their an idea on how we could get the 'k' value using python ?
This is not going to be trivial. The k parameter value is "hidden" deep inside a script element inside nested iframes. Here is a requests + BeautifulSoup way to get to the k value:
import re
from urlparse import urljoin
# Python 3: from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup
base_url = "http://www.protect-stream.com"
with requests.Session() as session:
response = session.get("http://www.protect-stream.com/PS_DL_xODN4o5HjLuqzEX5fRNuhtobXnvL9SeiyYcPLcqaqqXayD8YaIvg9Qo80hvgj4vCQkY95XB7iqcL4aF1YC8HRg_i_i")
# get the top frame url
soup = BeautifulSoup(response.content, "html.parser")
src = soup.select_one('iframe[src^="frame.php"]')["src"]
frame_url = urljoin(base_url, src)
# get the nested frame url
response = session.get(frame_url)
soup = BeautifulSoup(response.content, "html.parser")
src = soup.select_one('iframe[src^="w.php"]')["src"]
frame_url = urljoin(base_url, src)
# get the frame HTML source and extract the "k" value
response = session.get(frame_url)
soup = BeautifulSoup(response.content, "html.parser")
script = soup.find("script", text=lambda text: text and "k=" in text).get_text(strip=True)
k_value = re.search(r'var k="(.*?)";', script).group(1)
print(k_value)
Prints:
YjfH9430zztSYgf7ItQJ4grv2cvH3mT7xGwv32rTy2HiB1uuy_c,Sr7mOTQVUhVEcMlZeINICKegtzYsseabOlrDb_a,LmiP80NGUvAbK1xhbZGC6OWMtIaNF12f0mYA4O0WXhmwUC0ipkPRkLQepYHLyF1U0xvsrzHMcK2XBCeY3_a,O_b,

Categories

Resources