I am getting None from bs4's find() function even though the element exists in the HTML. I am trying to get all divs with the class tab_content on this page: https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1. Kindly suggest how to do this the right way.
This is the code:
from bs4 import BeautifulSoup as bs
import requests
url = 'https://sofad.qc.ca/index.php?id_category=78&controller=category&id_lang=1'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
tb = soup.find_all('a', class_='product_img_link')
for item in tb:
    link = item.get('href')
    r = requests.get(link)
    soup = bs(r.content, 'lxml')
    try:
        title = soup.find('h1', {'itemprop':'name'}).text
    except:
        title = ''
    try:
        price = soup.find('span', id='our_price_display').text
    except:
        price = ''
    try:
        img = soup.find('img', id='bigpic').get('src')
    except:
        img = ''
    try:
        dv = " ".join(soup.find('div', class_='rte').text.split())
    except:
        dv = ''
    for dvv in soup.find_all('div', class_='tab_content'):
        print(dvv)
In BeautifulSoup 4, find_all and findAll are equivalent (findAll is the legacy BS3-style name), so the method name is not your problem.
To find the divs with a class of tab_content:
tab_content = soup.find_all('div', class_='tab_content')
FYI, there is no div with the tab_content class in the HTML tree that requests receives for that page, which is why you get nothing back.
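If you want to verify that yourself, a minimal sketch (reusing the product URL from the question) that fetches the raw HTML and counts the matches:
import requests
from bs4 import BeautifulSoup

# Product URL taken from the question.
url = 'https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
divs = soup.find_all('div', class_='tab_content')
print(f"found {len(divs)} div.tab_content elements")
# If this prints 0 while your browser shows the tabs, the content is most
# likely injected by JavaScript and never appears in the raw HTML.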
from bs4 import BeautifulSoup
import requests
import re
def getHTMLdocument(url):
    response = requests.get(url)
    return response.text

def correct_url(url1):
    if not url1.startswith('https://www.parliament.gov.sg'):
        url1 = f'https://www.parliament.gov.sg{url1}'
    return url1
url_to_scrape = 'https://www.parliament.gov.sg/mps/list-of-current-mps'
links = []
while True:
    html_document = getHTMLdocument(url_to_scrape)
    soup = BeautifulSoup(html_document, 'lxml')
    if soup.find_all('a', attrs={'href': re.compile("/details/")}) == []:
        break
    for link in soup.find_all('a', attrs={'href': re.compile("/details/")}):
        if link.get('href') not in links:
            links.append(correct_url(link.get('href')))
for link in links:
    url = link
    member_info = 'mp-designation-wrap'
    member_info = 'mp-constituency-wrap'  # this is the value that triggers the error
    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'lxml')
    txt1 = soup.find('div', attrs={'class': member_info})
    textoutput = txt1.text
    print(textoutput)
    break
I'm trying to separate the different categories so I can save them separately; however, I only get output with member_info = 'mp-designation-wrap', and with 'mp-constituency-wrap' I get an AttributeError: 'NoneType' object has no attribute 'text'.
I don't understand why the two give different results; it would be great if someone could help me understand why and point me in the right direction.
The reason you get this error is that the element you are trying to select does not exist on some of the pages, so you have to check for that before calling .text.
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    text1 = e.text if (e := soup.find('div', attrs={'class': 'mp-designation-wrap'})) else None
    text2 = e.text if (e := soup.find('div', attrs={'class': 'mp-constituency-wrap'})) else None
    print(text2)
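Note that the := assignment expression requires Python 3.8 or newer. On older versions, an equivalent, slightly more verbose sketch:
for link in links:
    page = requests.get(link)
    soup = BeautifulSoup(page.text, 'lxml')
    # Look each element up once, then read .text only if it was found.
    e1 = soup.find('div', attrs={'class': 'mp-designation-wrap'})
    text1 = e1.text if e1 else None
    e2 = soup.find('div', attrs={'class': 'mp-constituency-wrap'})
    text2 = e2.text if e2 else None
    print(text2)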
I want to scrape the GulAhmed website. My code works correctly for one page, but I'm running into trouble scraping the remaining pages.
The page 1 URL: https://www.gulahmedshop.com/unstitched-fabric?p=01&product_list_limit=48
The last page URL: https://www.gulahmedshop.com/unstitched-fabric?p=25&product_list_limit=48
The problem is that I can't extract all 1162 items; only page one's data ends up in the CSV file.
Please help me.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
url = 'https://www.gulahmedshop.com/unstitched-fabric?p=01&product_list_limit=48'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_ = 'product-item-info')
#print (content)
suit = []
for property in content:
    name = property.find('a', class_='product-item-link').text.strip()
    #price1 = property.find('span', class_='price-container price-final_price tax weee')
    try:
        p_price = property.find('div', class_='price-box price-final_price')
        p_span_price = p_price.find('span', class_='price-container price-final_price tax weee')
        N_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
    except AttributeError:
        N_product_price = ''
    # print(N_product_price)
    try:
        p_price = property.find('div', class_='price-box price-final_price')
        p_span_price = p_price.find('span', class_='old-price')
        old_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
    except AttributeError:
        old_product_price = ''
    print(old_product_price)
    try:
        Offer = property.find('div', class_='label-content').text.strip()
    except:
        Offer = ''
    #print(Offer)
    #try:
    #    linkmain = property.find('div', class_='cdz-product-top')
    #    links = linkmain.find('a', href=True)
    #except AttributeError:
    #    links = ''
    try:
        links = property.find('a', {'class': 'product-item-link'})['href']
    except:
        links = ''
    image = property.find_all('img', {'class': 'product-image-photo'}, src=True)
    for i in image:
        if 'data:image' not in i['src']:
            images = i['src']
    # print(images)
    fabric = {
        'productname': name,
        #'product_made': product_made,
        #'product_color': product_color,
        'Product_Sale_price': N_product_price,
        'Product_Old_Price': old_product_price,
        'Offer': Offer,
        'product_image': images,
        #'Datetime': current_date,
        #'Member_acess': Member_acess,
        'links': links,
    }
    suit.append(fabric)
print(len(suit))
df = pd.DataFrame(suit)
print(df.head())
print("Saving")
df.to_csv('E:/unstitched-fabric.csv')
You should just be able to enclose your logic in a loop which cycles through each page number. For example:
import requests
from bs4 import BeautifulSoup
import pandas as pd
suit = []
for page_number in range(1, 26):
    print(f"Getting Page {page_number}")
    url = f'https://www.gulahmedshop.com/unstitched-fabric?p={page_number:02}&product_list_limit=48'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    content = soup.find_all('div', class_='product-item-info')
    for property in content:
        name = property.find('a', class_='product-item-link').text.strip()
        try:
            p_price = property.find('div', class_='price-box price-final_price')
            p_span_price = p_price.find('span', class_='price-container price-final_price tax weee')
            N_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
        except AttributeError:
            N_product_price = ''
        try:
            p_price = property.find('div', class_='price-box price-final_price')
            p_span_price = p_price.find('span', class_='old-price')
            old_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
        except AttributeError:
            old_product_price = ''
        try:
            Offer = property.find('div', class_='label-content').text.strip()
        except:
            Offer = ''
        try:
            links = property.find('a', {'class': 'product-item-link'})['href']
        except:
            links = ''
        image = property.find_all('img', {'class': 'product-image-photo'}, src=True)
        for i in image:
            if 'data:image' not in i['src']:
                images = i['src']
        fabric = {
            'productname': name,
            'Product_Sale_price': N_product_price,
            'Product_Old_Price': old_product_price,
            'Offer': Offer,
            'product_image': images,
            'links': links,
        }
        suit.append(fabric)
df = pd.DataFrame(suit)
df.to_csv('unstitched-fabric.csv', index=False)
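One optional refinement: the original script imports time but never uses it. A short pause between page requests is gentler on the server; a minimal sketch of where it would go:
import time

for page_number in range(1, 26):
    # fetch and parse the page exactly as in the loop above, then pause
    # briefly so 25 back-to-back requests don't hammer the site
    time.sleep(1)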
I am trying to parse data from all pages, but parsing ends after the first page. What could be the problem?
I paginate by following the "Next" link, matched with a regular expression.
The first page of the site differs from the other pages in its HTML, so I had to create two functions, main_1 and main_2, for the first page and for the rest.
If I run only the main_2 function, nothing works and the .csv file is not created.
Help me, please.
import requests
from bs4 import BeautifulSoup
import csv
import re
def get_html(url):
    r = requests.get(url)
    if r.ok:
        return r.text
    print(r.status_code)

def writer_csv(data):
    with open('tesr.csv', 'a') as f:
        writer = csv.writer(f)
        writer.writerow((data['name'], data['url'], data['price']))

def get_data_page(html):
    soup = BeautifulSoup(html, 'lxml')
    trs = soup.find_all('tr', class_='cmc-table-row')
    for tr in trs:
        tds = tr.find_all('td')
        try:
            name = tds[1].find('a', class_='cmc-link').text.strip()
        except:
            name = ''
        try:
            url = 'https://coinmarketcap.com' + str(tds[1].find('a', class_='cmc-link').get('href'))
        except:
            url = ''
        try:
            price = tr.find('td', class_='cmc-table__cell--sort-by__price').find('a').text.strip().replace('$', '')
        except:
            price = ''
        data = {'name': name,
                'url': url,
                'price': price}
        writer_csv(data)

def main_1():
    url_1 = 'https://coinmarketcap.com/'
    get_data_page(get_html(url_1))

def main_2():
    url_2 = 'https://coinmarketcap.com/2/'
    while True:
        get_data_page(get_html(url_2))
        soup = BeautifulSoup(get_html(url_2), 'lxml')
        try:
            pattern = 'Next '
            url_2 = 'https://coinmarketcap.com' + str(soup.find('ul', class_='pagination').find('a', text=re.compile(pattern)).get('href'))
        except:
            break
main_1()
main_2()
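For what it's worth, a minimal sketch of how the two entry points could collapse into one loop, assuming get_html and get_data_page behave as defined above; it starts from the first page, fetches each page only once per iteration, and stops when there is no "Next" link:
def main():
    url = 'https://coinmarketcap.com/'
    while True:
        html = get_html(url)
        if html is None:
            break
        get_data_page(html)
        soup = BeautifulSoup(html, 'lxml')
        try:
            # Follow the "Next" link; stop when the pagination block or link is missing.
            nxt = soup.find('ul', class_='pagination').find('a', text=re.compile('Next '))
            url = 'https://coinmarketcap.com' + str(nxt.get('href'))
        except AttributeError:
            break

main()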
I can't extract the href attribute of the anchors on the page. I tried using the re library:
for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
    links.append(link.get('href'))
But it doesn't work; I get the error:
table_rows = soup.find('table').find_all('tr')[1:]
AttributeError: 'NoneType' object has no attribute 'find_all'
Can you help me understand how exactly to extract them?
Thanks in advance.
Edit:
Full code:
import requests
from bs4 import BeautifulSoup
import re
DOMAIN_NAME = "https://www.dllr.state.md.us/employment"
BASE_URL = DOMAIN_NAME + '/warn2010.shtml'
def parse_url(url):
    html_source = requests.get(url, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    data = []
    table_rows = soup.find('table').find_all('tr')[1:]
    for table_row in table_rows:
        table_data = table_row.find_all('td')
        data.append({
            'notice_date': table_data[0].text,
            'naics_code': table_data[1].text,
            'company': table_data[2].text,
            'location': table_data[3].text,
            'wia_code': table_data[4].text,
            'total_employees': table_data[5].text,
            'effective_date': table_data[6].text,
            'type_code': table_data[7].text
        })
    return data

def run_ingestion():
    html_source = requests.get(BASE_URL, verify=False).text
    soup = BeautifulSoup(html_source, 'html.parser')
    for link in soup.findAll('a', attrs={'href': re.compile("^http://")}):
        print(link.get('href'))
        url = DOMAIN_NAME + '/' + link.get('href')
        data = parse_url(url)
        for row in data:
            print(row)

if __name__ == '__main__':
    run_ingestion()
Following your code, you should try this:
soup = BeautifulSoup(html_source, 'html.parser')
tag = soup.findAll('a', attrs={'href': re.compile("^http://")})
links = [i["href"] for i in tag]
As the Beautiful Soup documentation says:
If find() can’t find anything, it returns None
That means your soup.find('table') call did not find a table on that page.
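So guard against the missing table before calling find_all; a minimal sketch of the check inside your parse_url:
table = soup.find('table')
if table is None:
    return []  # no table on this page, so there are no rows to parse
table_rows = table.find_all('tr')[1:]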
I would go with a more succinct list comprehension, using an attribute = value CSS selector with the starts-with (^) operator:
links = [link['href'] for link in soup.select("a[href^='http:']")]
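Note that, like the original ^http:// regex, the href^='http:' selector matches only plain http: links; if the site also serves https URLs, a[href^='http'] would catch both.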
I need to access the following website: http://mothoq.com/store/22,
scroll down till I see the phone icon,
click on it, and scrape the phone number.
I have successfully connected to the website and am able to scrape all the data needed, except for the phone number.
I have tried to use
soup.find_all('p', attrs={"align": "center"})
My code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")
results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
    storeName = soup.find('h1')
    url = soup.find('font').text
    contacts = soup.find_all('p', attrs={"class": "store_connect_details"})
    for storeContact in contacts:
        storePhone = soup.find_all('p', attrs={"align": "center"})
        storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"})['href']
        storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"})['href']
        storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"})['href']
print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second <p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup
records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)
r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")
results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class":"store_connect_details"})
storePhone = storeTwitter = storeFacebook = storeLinkedin = None  # initialize so print below cannot hit a NameError
try:
    storePhone = soup.find('div', attrs={"id": "store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"}).get('href')
except:
    pass
print(storePhone)
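Two small notes on the code above: the variables are initialized to None first so the final print cannot raise a NameError when one of the lookups fails, and catching AttributeError instead of a bare except would let unrelated errors (network problems, typos) still surface.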