Extract href attribute url from anchor

Extract href attribute url from anchor - python

I can't extract the href attribute of the anchors on a page. I tried using the re library:
# Collect the href of every anchor whose href starts with "http://".
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    links.append(link.get('href'))
But it doesn't work; I get the error:
table_rows = soup.find('table').find_all('tr')[1:]
AttributeError: 'NoneType' object has no attribute 'find_all'
Can you help me understand how exactly to extract them?
Thanks in advance.
Edit:
Full code:
import requests
from bs4 import BeautifulSoup
import re
DOMAIN_NAME = "https://www.dllr.state.md.us/employment"
BASE_URL = DOMAIN_NAME + '/warn2010.shtml'
def parse_url(url):
    """Fetch ``url`` and turn the rows of its first table into dicts.

    The first ``<tr>`` of the table is skipped (presumably a header row),
    and the first eight ``<td>`` cells of every remaining row are stored
    under fixed keys.

    Returns:
        A list with one dict per table row.

    Raises:
        AttributeError: if the page contains no ``<table>`` element.
        IndexError: if a row has fewer than eight ``<td>`` cells.
    """
    # verify=False turns off TLS certificate verification for this request.
    html_source = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    data = []
    # NOTE: find() returns None when nothing matches, so on a page without
    # a <table> the chained .find_all() raises
    # "AttributeError: 'NoneType' object has no attribute 'find_all'".
    table_rows = soup.find('table').find_all('tr')[1:]
    for table_row in table_rows:
        table_data = table_row.find_all('td')
        data.append({
            'notice_date': table_data[0].text,
            'naics_code': table_data[1].text,
            'company': table_data[2].text,
            'location': table_data[3].text,
            'wia_code': table_data[4].text,
            'total_employees': table_data[5].text,
            'effective_date': table_data[6].text,
            'type_code': table_data[7].text
        })
    return data
def run_ingestion():
    """Fetch BASE_URL, follow its matching links and print every parsed row.

    Each anchor whose href starts with "http://" is printed, turned into a
    URL by prefixing DOMAIN_NAME, and handed to parse_url(); the resulting
    row dicts are printed one per line.
    """
    html_source = requests.get(BASE_URL, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        print(link.get('href'))
        # NOTE(review): the filter above keeps only hrefs that already start
        # with "http://", yet DOMAIN_NAME is prepended here, producing
        # "<DOMAIN_NAME>/http://...". That is probably not the intended
        # page and may be why parse_url() finds no <table> -- confirm.
        url = DOMAIN_NAME + '/' + link.get('href')
        data = parse_url(url)
        for row in data:
            print(row)


if __name__ == '__main__':
    run_ingestion()

Following your code, you should try this:
soup = BeautifulSoup(html_source, 'html.parser')
tag = soup.findAll('a', attrs={'href': re.compile("^http://")})
links = [i["href"] for i in tag]

As the Beautiful Soup documentation says:
If find() can’t find anything, it returns None
That means soup.find('table') found no <table> element on the page and returned None, so the chained .find_all('tr') raises the AttributeError.

I would go with a more succinct list comprehension that uses a CSS attribute selector with the "starts with" operator (^=):
links = [link['href'] for link in soup.select("a[href^='http:']")]

Related

Why do I get output from one but not the other?

from bs4 import BeautifulSoup
import requests
import re
def getHTMLdocument(url):
    """Return the body of the HTTP GET response for ``url`` as text."""
    response = requests.get(url)
    return response.text
def correct_url(url1):
    """Return ``url1`` as an absolute https://www.parliament.gov.sg URL.

    A value that already starts with the site prefix is returned unchanged;
    anything else is treated as a site-relative path and gets the prefix
    prepended.
    """
    if not url1.startswith('https://www.parliament.gov.sg'):
        url1 = f'https://www.parliament.gov.sg{url1}'
    return url1
url_to_scrape = 'https://www.parliament.gov.sg/mps/list-of-current-mps'
links = []
# NOTE(review): indentation was lost in this listing; the nesting below is a
# reconstruction. The trailing ``break`` is placed at loop level, so the
# ``while`` body runs once -- confirm against the original script.
while True:
    html_document = getHTMLdocument(url_to_scrape)
    soup = BeautifulSoup(html_document, 'lxml')
    # Query the document once and reuse the result for both the emptiness
    # check and the iteration.
    detail_anchors = soup.find_all('a', attrs={'href': re.compile("/details/")})
    if not detail_anchors:
        break
    for link in detail_anchors:
        # Compare the normalised URL, since that is what ``links`` stores;
        # comparing the raw href would never detect a duplicate.
        full_url = correct_url(link.get('href'))
        if full_url not in links:
            links.append(full_url)
    for link in links:
        url = link
        member_info = 'mp-designation-wrap'
        # Alternative class from the question; with it, find() returns None
        # on some pages and ``txt1.text`` raises AttributeError:
        # member_info = 'mp-constituency-wrap'
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        txt1 = soup.find('div', attrs={'class': member_info})
        textoutput = txt1.text
        print(textoutput)
    break
I'm trying to separate the different categories so I can save them separately. However, I only get output when using member_info = 'mp-designation-wrap'; I get an AttributeError: 'NoneType' object has no attribute 'text' when using 'mp-constituency-wrap'.
I do not understand why it is giving me different results and it would be great if someone could help me understand why it is so and point me in the right direction

The reason you get this error is that the element you are trying to select does not exist in some of your resources, so you have to check for that before calling .text.
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    # find() returns None when the div is absent; the walrus binds the result
    # so .text is only read from a real element, otherwise None is used.
    text1 = e.text if (e := soup.find('div', attrs={'class': 'mp-designation-wrap'})) else None
    text2 = e.text if (e := soup.find('div', attrs={'class': 'mp-constituency-wrap'})) else None
    print(text2)

How can I extract href links from a within a table th using BeautifulSoup

I am trying to create a list of all football teams/links from any one of a number of tables within the base URL: https://fbref.com/en/comps/10/stats/Championship-Stats
I would then use the link from the href to scrape each individual team's data. The href is embedded within the th tag as per below
<th scope="row" class="left " data-stat="squad">
<a href="/en/squads/293cb36b/Barnsley-Stats">Barnsley</a></th>
The following code gives me a list of the 'a' tags
page = "https://fbref.com/en/comps/10/Championship-Stats"
pageTree = requests.get(page)
pageSoup = BeautifulSoup(pageTree.content, 'html.parser')
Teams = pageSoup.find_all("th", {"class": "left"})
Output(for each class of 'left'):
<th class="left" data-stat="squad" scope="row">
<a href="/en/squads/293cb36b/Barnsley-Stats">Barnsley</a></th>,
I have tried the guidance from a previous Stack question (Extract links after th in beautifulsoup)
However, the following code based on that thread produces errors
AttributeError: 'NoneType' object has no attribute 'find_parent'
def import_TeamList():
    """Build a list of team URLs from the Championship stats page.

    Fetches BASE_URL, locates a starting table row and walks its following
    sibling rows, appending BASE_URL plus each row's first anchor href.

    Returns:
        The list of collected URL strings.

    Raises:
        AttributeError: if the initial anchor lookup finds nothing.
    """
    BASE_URL = "https://fbref.com/en/comps/10/Championship-Stats"
    r = requests.get(BASE_URL)
    soup = BeautifulSoup(r.text, 'lxml')
    team_list = []
    # NOTE: in the sample markup quoted in the question, data-stat="squad" is
    # an attribute of the <th>, not of the <a>. This lookup therefore returns
    # None and the chained .find_parent() raises the reported AttributeError.
    team_tr = soup.find('a', {'data-stat': 'squad'}).find_parent('tr')
    for tr in team_tr.find_next_siblings('tr'):
        if tr.find('a').text != 'squad':
            break
        team_list.append(BASE_URL + tr.find('a')['href'])
    return team_list

Here is a version using CSS selectors, which I find simpler than most other methods.
import requests
from bs4 import BeautifulSoup
url = 'https://fbref.com/en/comps/10/stats/Championship-Stats'
data = requests.get(url).text
# Parse the document once and name the parser explicitly; without it bs4
# picks whichever parser is installed and emits a warning.
soup = BeautifulSoup(data, 'html.parser')
# "th a" selects every <a> that is a descendant of a <th>.
links = soup.select('th a')
urls = [link['href'] for link in links]
print(urls)

Is this what you're looking for?
import requests
from bs4 import BeautifulSoup as BS
from lxml import etree
with requests.Session() as session:
    r = session.get('https://fbref.com/en/comps/10/stats/Championship-Stats')
    # Fail early on an HTTP error status instead of parsing an error page.
    r.raise_for_status()
    dom = etree.HTML(str(BS(r.text, 'lxml')))
    # XPath addresses attributes with "@"; "#class" is not valid XPath.
    for a in dom.xpath('//th[@class="left"]/a'):
        print(a.attrib['href'])

python bs4 find() return none

I am getting None from bs4's find() even though the element exists in the HTML. I am trying to get all divs with the class tab_content, which I am looking for on this page: https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1. Kindly suggest how to do this the right way.
This is the code:
from bs4 import BeautifulSoup as bs
import requests
url = 'https://sofad.qc.ca/index.php?id_category=78&controller=category&id_lang=1'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
tb = soup.find_all('a', class_='product_img_link')
for item in tb:
    # Follow each product link and re-bind r/soup to the product page.
    link = item.get('href')
    r = requests.get(link)
    soup = bs(r.content, 'lxml')
    # find() returns None for a missing element, so the attribute access
    # raises AttributeError; each field then falls back to ''.
    try:
        title = soup.find('h1', {'itemprop':'name'}).text
    except AttributeError:
        title = ''
    try:
        price = soup.find('span', id='our_price_display').text
    except AttributeError:
        price = ''
    try:
        img = soup.find('img', id='bigpic').get('src')
    except AttributeError:
        img = ''
    try:
        # split()/join collapses every run of whitespace to a single space.
        dv = " ".join(soup.find('div', class_='rte').text.split())
    except AttributeError:
        dv = ''
    for dvv in soup.find_all('div', class_='tab_content'):
        print(dvv)

Note that find_all and findAll are the same method in bs4 (findAll is the older BeautifulSoup 3 spelling), so either one works.
To find a div with a class of tab_content:
tab_content = soup.findAll('div', class_='tab_content')
FYI, there is no div with tab_content class in the html tree on that page.

why is my scraper showing 'none' as the output?

from bs4 import BeautifulSoup
import requests
url = 'https://www.flipkart.com'
data = {
    'q': 'laptops'
}
# Sends the search term as form data in a POST to the site root.
html_code = requests.post(url, data=data).text
soup = BeautifulSoup(html_code, 'lxml')
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')

you can try this code
import requests
from bs4 import BeautifulSoup
to_search = 'laptops'
# The URL has to be one string; adjacent literals inside the parentheses are
# concatenated, which keeps the lines short without breaking the literal.
url = (
    'https://www.flipkart.com/search?q={}'
    '&otracker=search&otracker1=search&marketplace=FLIPKART'
    '&as-show=on&as=off'
).format(to_search)
html_code = requests.get(url).text
soup = BeautifulSoup(html_code, 'lxml')
titles = soup.findAll('div', class_='_3wU53n')
for title in titles:
    print(title, end='\n')

Can you please check again. Firstly, The url 'https://www.flipkart.com' is actually a 'GET' request and not 'POST'. Secondly, the class: _3wU53n which you have provided is actually not present in the HTML DOM. Have a look in the below screenshot for the 'GET' request and 'Absence of class: _3wU53n' in the HTML DOM.

Beautiful Soup find nested div

I am trying to parse a webpage that looks like this with Python->Beautiful Soup
see image
I need data from
<div class="p-offer__price-new">199,99 ₽</div>
I tried this code:
soup = BeautifulSoup(data)
res = soup.findAll("div", {"class": "poffer__price-new"})
print(res)
But result is empty -- []
How can I get this data?
Example of URL: https://edadeal.ru/moskva/offers/d71b75ff-bfee-4731-95ad-52a24ddea72e?from=%2F

import bs4
from selenium import webdriver
# Raw string so the backslashes in the Windows path are taken literally
# rather than being read as (invalid) escape sequences.
driver = webdriver.Chrome(r'C:\chromedriver_win32\chromedriver.exe')
try:
    driver.get('https://edadeal.ru/moskva/offers/d71b75ff-bfee-4731-95ad-52a24ddea72e?from=%2F')
    # page_source is the HTML as currently held by the browser.
    html = driver.page_source
finally:
    # Always end the browser session, even if loading the page failed.
    driver.quit()
soup = bs4.BeautifulSoup(html,'html.parser')
res = soup.findAll("div", {"class": "p-offer__price-new"})
print (res[0].text)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Extract href attribute url from anchor - python

Following your code, you should try this: soup = BeautifulSoup(html_source, 'html.parser') tag = soup.findAll('a', attrs={'href': re.compile("^http://")}) links = [i["href"] for i in tag]

As the Beautiful Soup documentation says: If find() can’t find anything, it returns None. That means soup.find('table') found no <table> element on the page and returned None.

I would go with a more succinct list comprehension where you use an attribute = value selector with starts with ^ operator links = [link['href'] for link in soup.select("a[href^='http:']")]

Related

Why do I get output from one but not the other?

How can I extract href links from a within a table th using BeautifulSoup

python bs4 find() return none

why is my scraper showing 'none' as the output?

Beautiful Soup find nested div

Categories

Resources