Why do I get output from one but not the other? - python

from bs4 import BeautifulSoup
import requests
import re

def getHTMLdocument(url):
    response = requests.get(url)
    return response.text

def correct_url(url1):
    if not url1.startswith('https://www.parliament.gov.sg'):
        url1 = f'https://www.parliament.gov.sg{url1}'
    return url1

url_to_scrape = 'https://www.parliament.gov.sg/mps/list-of-current-mps'
links = []

while True:
    html_document = getHTMLdocument(url_to_scrape)
    soup = BeautifulSoup(html_document, 'lxml')
    if soup.find_all('a', attrs={'href': re.compile("/details/")}) == []:
        break
    for link in soup.find_all('a', attrs={'href': re.compile("/details/")}):
        if link.get('href') not in links:
            links.append(correct_url(link.get('href')))
    for link in links:
        url = link
        member_info = 'mp-designation-wrap'
        # member_info = 'mp-constituency-wrap'  # swapping this in gives the AttributeError
        page = requests.get(url)
        soup = BeautifulSoup(page.text, 'lxml')
        txt1 = soup.find('div', attrs={'class': member_info})
        textoutput = txt1.text
        print(textoutput)
    break
I'm trying to separate the different categories so I can save each one separately. However, I only get output when using member_info = 'mp-designation-wrap'; with 'mp-constituency-wrap' I get AttributeError: 'NoneType' object has no attribute 'text'.
I don't understand why the two give different results. It would be great if someone could help me understand why that is and point me in the right direction.

The reason you get this error is that the element you are trying to select does not exist on some of the pages, so you have to check for that before calling .text.
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    text1 = e.text if (e := soup.find('div', attrs={'class': 'mp-designation-wrap'})) else None
    text2 = e.text if (e := soup.find('div', attrs={'class': 'mp-constituency-wrap'})) else None
    print(text2)
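Note that the := (walrus) syntax needs Python 3.8+. On older versions, an equivalent sketch with explicit checks (it assumes the links list from the question, like the snippet above):

import requests
from bs4 import BeautifulSoup

for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    # find() returns None when the div is absent, so check before touching .text
    designation = soup.find('div', attrs={'class': 'mp-designation-wrap'})
    constituency = soup.find('div', attrs={'class': 'mp-constituency-wrap'})
    text1 = designation.text if designation else None
    text2 = constituency.text if constituency else None
    print(text1, text2)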

Related

python bs4 find() return none

I am getting None from bs4's find() even though the element exists in the HTML. I am trying to get all divs with the class tab_content. I am looking on this link: https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1. Kindly suggest how to do this the right way.
This is the code:
from bs4 import BeautifulSoup as bs
import requests

url = 'https://sofad.qc.ca/index.php?id_category=78&controller=category&id_lang=1'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
tb = soup.find_all('a', class_='product_img_link')
for item in tb:
    link = item.get('href')
    r = requests.get(link)
    soup = bs(r.content, 'lxml')
    try:
        title = soup.find('h1', {'itemprop': 'name'}).text
    except:
        title = ''
    try:
        price = soup.find('span', id='our_price_display').text
    except:
        price = ''
    try:
        img = soup.find('img', id='bigpic').get('src')
    except:
        img = ''
    try:
        dv = " ".join(soup.find('div', class_='rte').text.split())
    except:
        dv = ''
    for dvv in soup.find_all('div', class_='tab_content'):
        print(dvv)
For what it's worth, both findAll and find_all work in BeautifulSoup 4 (find_all is the current name; findAll is a legacy alias), so the method name is not the problem.
To find every div with a class of tab_content:
    tab_content = soup.find_all('div', class_='tab_content')
FYI, there is no div with the tab_content class in the HTML tree on that page, which is why you get nothing back.
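A quick way to verify whether the markup you actually fetched contains that class (a minimal sketch using the product URL from the question):

from bs4 import BeautifulSoup
import requests

r = requests.get('https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1')
soup = BeautifulSoup(r.content, 'html.parser')
# count the matching divs in the HTML as served to requests (no JavaScript runs here)
print(len(soup.find_all('div', class_='tab_content')))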

Python - getting a unique list

I am using the code below:
import requests
from bs4 import BeautifulSoup

def recursiveUrl(url, link, depth):
    if depth == 5:
        return url
    else:
        print(link['href'])
        page = requests.get(url + link['href'])
        soup = BeautifulSoup(page.text, 'html.parser')
        newlink = soup.find('a')
        if len(newlink) == 0:
            return link
        else:
            return link, recursiveUrl(url, newlink, depth + 1)

def getLinks(url):
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser')
    links = soup.find_all('a')
    for link in links:
        links.append(recursiveUrl(url, link, 0))
    return links

links = getLinks("https://www.rogerebert.com/reviews/")

def unique(links):
    uniqueValues = {}
    for i in links:
        uniqueValues.add(i)
    for i in uniqueValues:
        print(i)

unique(links)
I have tried a number of ways to print only unique entries, but my output is a long list full of duplicates; ideally each unique entry should be printed only once.
Thanks again for all the help.
You have a mistake in your code: uniqueValues.add(i) fails because you set uniqueValues to a dict ({}) just before, and dict has no add method.
import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.rogerebert.com/reviews/')
soup = BeautifulSoup(r.text, 'html.parser')
links = set()
for item in soup.findAll('a'):
    item = item.get('href')
    links.add(item)
for item in links:
    print(item)
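One caveat the snippet above doesn't handle (an assumption about the page, since some anchors may carry no href): item.get('href') returns None for those, so the set can end up with a None entry. A filtered sketch:

import requests
from bs4 import BeautifulSoup

r = requests.get('https://www.rogerebert.com/reviews/')
soup = BeautifulSoup(r.text, 'html.parser')
# set comprehension; the condition drops anchors without an href attribute
links = {a.get('href') for a in soup.findAll('a') if a.get('href')}
for item in links:
    print(item)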
Instead of using a list, try using a set. That way you don't have multiple instances of the same website.
uniqueValues = set()  # note: {} would create an empty dict, not a set
for i in links:
    uniqueValues.add(i)
for i in uniqueValues:
    print(i)
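If the order of first appearance matters, a plain set will lose it. A common alternative (my suggestion, not from the answers above) is dict.fromkeys, which deduplicates while preserving insertion order:

# standalone example with dummy data
links = ['a.html', 'b.html', 'a.html', 'c.html', 'b.html']
unique_links = list(dict.fromkeys(links))  # keeps first-seen order
print(unique_links)  # ['a.html', 'b.html', 'c.html']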

Extract href attribute url from anchor

I can't extract the href attribute of the anchors from the page. I tried using the re library:
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    links.append(link.get('href'))
But it doesn't work; I get the error:
table_rows = soup.find('table').find_all('tr')[1:]
AttributeError: 'NoneType' object has no attribute 'find_all'
Can you help me understand better how exactly to extract them?
Thanks in advance.
Edit:
Full code:
import requests
from bs4 import BeautifulSoup
import re

DOMAIN_NAME = "https://www.dllr.state.md.us/employment"
BASE_URL = DOMAIN_NAME + '/warn2010.shtml'

def parse_url(url):
    html_source = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    data = []
    table_rows = soup.find('table').find_all('tr')[1:]
    for table_row in table_rows:
        table_data = table_row.find_all('td')
        data.append({
            'notice_date': table_data[0].text,
            'naics_code': table_data[1].text,
            'company': table_data[2].text,
            'location': table_data[3].text,
            'wia_code': table_data[4].text,
            'total_employees': table_data[5].text,
            'effective_date': table_data[6].text,
            'type_code': table_data[7].text
        })
    return data

def run_ingestion():
    html_source = requests.get(BASE_URL, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        print(link.get('href'))
        url = DOMAIN_NAME + '/' + link.get('href')
        data = parse_url(url)
        for row in data:
            print(row)

if __name__ == '__main__':
    run_ingestion()
Following your code, you should try this:
soup = BeautifulSoup(html_source, 'html.parser')
tag = soup.findAll('a', attrs={'href': re.compile("^http://")})
links = [i["href"] for i in tag]
As the Beautiful Soup documentation says:
If find() can't find anything, it returns None.
That means your soup.find('table') found no table on that page, which is what triggers the AttributeError.
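To make the crash impossible regardless of the page, you can guard the table lookup before chaining (a sketch of the idea; it returns the raw cells rather than the full dicts from the question):

import requests
from bs4 import BeautifulSoup

def parse_url(url):
    html_source = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    table = soup.find('table')
    if table is None:
        return []  # no table on this page: skip it instead of crashing
    # skip the header row, then collect the <td> cells of each remaining row
    return [row.find_all('td') for row in table.find_all('tr')[1:]]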
I would go with a more succinct list comprehension, using an attribute = value CSS selector with the starts-with (^) operator:
links = [link['href'] for link in soup.select("a[href^='http:']")]

Access Hidden Data on a page

I need to access the following website: http://mothoq.com/store/22, scroll down until I see the phone icon, click on it, and scrape the phone number.
I have successfully connected to the website and I am able to scrape all the data I need, except the phone number.
I have tried to use
    soup.find_all('p', attrs={"align": "center"})
My code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
    storeName = soup.find('h1')
    url = soup.find('font').text
    contacts = soup.find_all('p', attrs={"class": "store_connect_details"})
    for storeContact in contacts:
        storePhone = soup.find_all('p', attrs={"align": "center"})
        storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"})['href']
        storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"})['href']
        storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"})['href']
        print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second <p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class": "store_connect_details"})
try:
    storePhone = soup.find('div', attrs={"id": "store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"}).get('href')
except:
    pass
print(storePhone)
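A slightly more defensive variant of just the phone lookup (my sketch; note that the bare except above swallows failures, and if the very first line in the try fails, the final print(storePhone) raises NameError):

import requests
from bs4 import BeautifulSoup

r = requests.get("http://mothoq.com/store/22")
soup = BeautifulSoup(r.text, "lxml")

# guard each step so a missing element yields None instead of an exception
phone_form = soup.find('div', attrs={"id": "store-telephone-form"})
paragraphs = phone_form.select('p') if phone_form else []
storePhone = paragraphs[1].text if len(paragraphs) > 1 else None
print(storePhone)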

python beautiful soup output into excel

I am trying to get the output from the Python script into Excel. The script works fine on its own, but when I add the csv import and the writerow call it doesn't work: it says price is not defined in writerow. Also, how would I write multiple items? Any help would be appreciated.
import csv
import requests
from bs4 import BeautifulSoup

f = open('dataoutput.csv', 'w', newline="")
writer = csv.writer(f)

def trade_spider(max_pages):
    page = 1
    while page <= max_pages:
        url = 'http://www.zoopla.co.uk/for-sale/property/manchester/?identifier=manchester&q=manchester&search_source=home&radius=0&pn=' + str(page)
        source_code = requests.get(url)
        plain_text = source_code.text
        soup = BeautifulSoup(plain_text)
        for link in soup.findAll('a', {'class': 'listing-results-price text-price'}):
            href = "http://www.zoopla.co.uk" + link.get('href')
            title = link.string
            get_single_item_data(href)
        page += 1

def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        print(item_name.get_text(strip=True))
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        print(item_fame.get_text(strip=True))
    writer.writerow(price)

trade_spider(1)
The object price is not defined anywhere in your script outside of the function get_single_item_data, so outside of that function your code cannot see any object with that name. Also, get_single_item_data does not return anything from the BeautifulSoup object; it only prints it. You should rewrite your function to be something like this:
def get_single_item_data(item_url):
    source_code = requests.get(item_url)
    plain_text = source_code.text
    soup = BeautifulSoup(plain_text)
    # create list to contain addresses
    addresses = []
    for item_name in soup.findAll('div', {'class': 'listing-details-address'}):
        address = item_name.string
        # add each address to the list
        addresses.append(address)
        print(item_name.get_text(strip=True))
    # create list for prices
    prices = []
    for item_fame in soup.findAll('div', {'class': 'listing-details-price text-price'}):
        price = item_fame.string
        # add prices to list
        prices.append(price)
        print(item_fame.get_text(strip=True))
    # alter the code to return the data structure you prefer
    return [addresses, prices]
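From there, a sketch of how the caller could write the returned data out row by row (assuming addresses and prices pair up one-to-one; zip stops at the shorter list):

import csv

def write_rows(pairs, path='dataoutput.csv'):
    # pairs is the [addresses, prices] structure returned above
    addresses, prices = pairs
    with open(path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['address', 'price'])  # header row
        for address, price in zip(addresses, prices):
            writer.writerow([address, price])  # one row per listing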
