I want to scrape the GulAhmed website. My code works for a single page, but I am running into a multi-page scraping problem.
The page 1 URL: https://www.gulahmedshop.com/unstitched-fabric?p=01&product_list_limit=48
The last page URL: https://www.gulahmedshop.com/unstitched-fabric?p=25&product_list_limit=48
The problem is that I cannot extract all 1,162 items; only the page-one data ends up in my CSV file.
Please help me.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

url = 'https://www.gulahmedshop.com/unstitched-fabric?p=01&product_list_limit=48'
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
content = soup.find_all('div', class_='product-item-info')

suit = []
for property in content:
    name = property.find('a', class_='product-item-link').text.strip()
    try:
        p_price = property.find('div', class_='price-box price-final_price')
        p_span_price = p_price.find('span', class_='price-container price-final_price tax weee')
        N_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
    except AttributeError:
        N_product_price = ''
    try:
        p_price = property.find('div', class_='price-box price-final_price')
        p_span_price = p_price.find('span', class_='old-price')
        old_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
    except AttributeError:
        old_product_price = ''
    print(old_product_price)
    try:
        Offer = property.find('div', class_='label-content').text.strip()
    except:
        Offer = ''
    try:
        links = property.find('a', {'class': 'product-item-link'})['href']
    except:
        links = ''
    image = property.find_all('img', {'class': 'product-image-photo'}, src=True)
    for i in image:
        if 'data:image' not in i['src']:
            images = i['src']
    fabric = {
        'productname': name,
        'Product_Sale_price': N_product_price,
        'Product_Old_Price': old_product_price,
        'Offer': Offer,
        'product_image': images,
        'links': links,
    }
    suit.append(fabric)

print(len(suit))
df = pd.DataFrame(suit)
print(df.head())
print("Saving")
df.to_csv('E:/unstitched-fabric.csv')
You should just be able to enclose your logic in a loop which cycles through each page number. For example:
import requests
from bs4 import BeautifulSoup
import pandas as pd

suit = []
for page_number in range(1, 26):
    print(f"Getting Page {page_number}")
    url = f'https://www.gulahmedshop.com/unstitched-fabric?p={page_number:02}&product_list_limit=48'
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    content = soup.find_all('div', class_='product-item-info')
    for property in content:
        name = property.find('a', class_='product-item-link').text.strip()
        try:
            p_price = property.find('div', class_='price-box price-final_price')
            p_span_price = p_price.find('span', class_='price-container price-final_price tax weee')
            N_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
        except AttributeError:
            N_product_price = ''
        try:
            p_price = property.find('div', class_='price-box price-final_price')
            p_span_price = p_price.find('span', class_='old-price')
            old_product_price = p_span_price.find('span', {"class": 'price'}).text.strip()
        except AttributeError:
            old_product_price = ''
        try:
            Offer = property.find('div', class_='label-content').text.strip()
        except:
            Offer = ''
        try:
            links = property.find('a', {'class': 'product-item-link'})['href']
        except:
            links = ''
        image = property.find_all('img', {'class': 'product-image-photo'}, src=True)
        for i in image:
            if 'data:image' not in i['src']:
                images = i['src']
        fabric = {
            'productname': name,
            'Product_Sale_price': N_product_price,
            'Product_Old_Price': old_product_price,
            'Offer': Offer,
            'product_image': images,
            'links': links,
        }
        suit.append(fabric)

df = pd.DataFrame(suit)
df.to_csv('unstitched-fabric.csv', index=False)
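One optional refinement: the question's script imports time but never uses it, and this loop fires 25 requests in quick succession, so a short pause between pages is kinder to the server. A minimal sketch (the one-second delay is an assumption, not a site requirement):

import requests
import time

for page_number in range(1, 26):
    url = f'https://www.gulahmedshop.com/unstitched-fabric?p={page_number:02}&product_list_limit=48'
    r = requests.get(url)
    # ... parse the page exactly as above ...
    time.sleep(1)  # assumed politeness delay; tune to taste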
Related (a similar scraping snippet, for reference):
import pandas as pd
import datetime
import requests
from requests.exceptions import ContentDecodingError
from bs4 import BeautifulSoup

# The function to help find the information from the URL below.
def web_content_div(web_content, class_path):
    div_list = web_content.find_all('div', {'class': class_path})
    try:
        spans = div_list[0].find_all('span')
        texts = [span.get_text() for span in spans]
    except IndexError:
        texts = []
    return texts

# The function to get the information from the URL.
def real_time_price(stock_code):
    url = 'https://finance.yahoo.com/quote/' + stock_code + '?p=' + stock_code + '&.tsrc=fin-srch'
    try:
        r = requests.get(url)
        web_content = BeautifulSoup(r.text, 'lxml')
        texts = web_content_div(web_content, 'My(6px) Pos(r) smartphone_Mt(6px)')
        if texts != []:
            price, change = texts[0], texts[1]
        else:
            price, change = [], []
    except ConnectionError:
        price, change = [], []
    return price, change

# Assign stock = BRK-B.
stock_code = ['BRK-B']
print(real_time_price('BRK-B'))
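The snippet only ever queries the single hard-coded ticker. A minimal usage sketch that iterates over the stock_code list instead (assuming Yahoo still serves those CSS class names, which change frequently and will silently break this scraper):

for code in stock_code:
    price, change = real_time_price(code)
    print(code, price, change)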
I am getting None from bs4's find() function even though the element exists in the HTML. I am trying to get every div with the class tab_content on this page: https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1. Kindly suggest the right way to do this.
This is the code:
from bs4 import BeautifulSoup as bs
import requests

url = 'https://sofad.qc.ca/index.php?id_category=78&controller=category&id_lang=1'
r = requests.get(url)
soup = bs(r.content, 'html.parser')
tb = soup.find_all('a', class_='product_img_link')
for item in tb:
    link = item.get('href')
    r = requests.get(link)
    soup = bs(r.content, 'lxml')
    try:
        title = soup.find('h1', {'itemprop': 'name'}).text
    except:
        title = ''
    try:
        price = soup.find('span', id='our_price_display').text
    except:
        price = ''
    try:
        img = soup.find('img', id='bigpic').get('src')
    except:
        img = ''
    try:
        dv = " ".join(soup.find('div', class_='rte').text.split())
    except:
        dv = ''
    for dvv in soup.find_all('div', class_='tab_content'):
        print(dvv)
A note on syntax first: in BeautifulSoup 4 both spellings work; find_all is the current name and findAll is the legacy alias, so the method name is not the problem.
To find every div with a class of tab_content:
tab_content = soup.find_all('div', class_='tab_content')
FYI, there is no div with the tab_content class in the HTML tree of that page, which is why find_all returns an empty list.
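A quick way to verify this (a diagnostic sketch, using the product URL from the question) is to check whether the class name appears anywhere in the HTML that requests actually receives:

import requests

url = 'https://sofad.qc.ca/index.php?id_product=464&controller=product&id_lang=1'
r = requests.get(url)
print('tab_content' in r.text)  # False suggests the element is added by JavaScript or absent entirely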
I wrote a web-scraping script, and my code is fine except for two issues. On the detail page everything works except the ISBN number, and from the main page I need all the listing URLs so that my code can scrape the data from every listing. Please guide me on how to fix this. Both URLs (main page and detail page) are in the code. Thank you!
Here is my code:
import requests
from bs4 import BeautifulSoup
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser')  # 1. html, 2. parser
        return soup

def get_detail_data(soup):
    try:
        title = soup.find('span', class_="title product-field", id=False).text
    except:
        title = 'empty'
    print(title)
    try:
        writer = soup.find('a', class_="contributor-name", id=False).text
    except:
        writer = 'empty'
    print(writer)
    try:
        original_price = soup.find('div', class_="original-price", id=False).find('span').text
    except:
        original_price = 'empty'
    print(original_price)
    try:
        active_price = soup.find('div', class_="active-price", id=False).find('span').text
    except:
        active_price = 'empty'
    print(active_price)
    try:
        img = soup.find('div', class_="image-actions image-container product-type-icon-container book", id=False).find('img').attrs['src']
    except:
        img = 'empty'
    print(img)
    try:
        isbn = soup.find('div', class_="bookitem-secondary-metadata", id=False).find('li').attrs['ISBN: ']
    except:
        isbn = 'empty'
    print(isbn)
    data = {
        'title': title,
        'writer': writer,
        'original_price': original_price,
        'active_price': active_price,
        'image': img,
        'isbn': isbn
    }
    return data

def get_index_data(soup):
    titles_link = soup.find_all('a', class_="body_link_11")
    try:
        inks = soup.find('div', class_="item-info", id=False).find('p').find('a').get('href')
    except:
        inks = "empty"
    print(inks)

def main():
    #detail_page_url = "https://www.kobo.com/ww/en/ebook/mum-dad-1"
    mainurl = "https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q"
    #get_detail_data(get_page(detail_page_url))
    get_index_data(get_page(mainurl))

if __name__ == '__main__':
    main()
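The answer below sidesteps the brittle tag lookups entirely: each listing on the page carries a JSON-LD script block containing the title, author, ISBN, thumbnail, and URL, and the prices are read from the price spans: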
import requests
import re
import json
import csv
from bs4 import BeautifulSoup

def Soup(content):
    soup = BeautifulSoup(content, 'html.parser')
    return soup

def Main(url):
    r = requests.get(url)
    soup = Soup(r.content)
    scripts = soup.findAll("script", type="application/ld+json", text=re.compile("data"))
    prices = [span.text for span in soup.select("p.product-field.price span span") if span.text != "USD"]
    with open("data.csv", 'w', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["Title", "Writer", "Price", "ISBN", "IMG", "URL"])
        for script, price in zip(scripts, prices):
            script = json.loads(script.text)
            title = script["data"]["name"]
            author = script["data"]["author"][0]["name"]
            img = f'https:{script["data"]["thumbnailUrl"]}'
            isbn = script["data"]["isbn"]
            url = script["data"]["url"]
            writer.writerow([title, author, price, isbn, img, url])

Main("https://www.kobo.com/ww/en/list/new-hot-in-fiction/youL53408U25RHrVu3wR5Q")
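A quick way to sanity-check the output (a sketch; assumes pandas is installed) is to read the CSV back after Main() has run:

import pandas as pd

df = pd.read_csv("data.csv")
print(df.head())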
My code is accurate for a single page, but it breaks when I loop over multiple records. I select the person, location, phone number, and cell number by index ([1], [2], and so on), so if a field is missing, say the person's name, the next value gets extracted into the person variable instead. Could you please fix this issue?
Here is my code:
import requests
from bs4 import BeautifulSoup
import re

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')  # 1. html, 2. parser
        return soup

def get_detail_data(soup):
    try:
        title = soup.find("h1", {'class': 'sc-AykKI'}).text
    except:
        title = 'Empty Title'
    try:
        person = soup.find("span", {'class': 'Contact__Item-sc-1giw2l4-2 kBpGee'}).text.strip()
    except:
        person = 'Empty Person'
    try:
        addr = soup.findAll("span", {'class': 'Contact__Item-sc-1giw2l4-2 kBpGee'})[1].text
    except:
        addr = 'Empty Address'
    try:
        ratting = soup.find("div", {'class': 'Rating__RatingText-sc-1r9ytu8-1 jIdgkl'}).text
    except:
        ratting = 'Empty Ratting'
    try:
        abn = re.search('abn\\\\":\\\\"(.*?)\\\\"', soup.text).group(1)
    except:
        abn = 'Empty ABN'
    try:
        website = re.search('website\\\\":\\\\"(.*?)\\\\"', soup.text).group(1)
    except:
        website = 'Empty Website'
    try:
        phone = re.search('phone\\\\":\\\\"(.*?)\\\\"', soup.text).group(1)
    except:
        phone = 'Empty Phone No'
    try:
        cell = re.search('mobile\\\\":\\\\"(.*?)\\\\"', soup.text).group(1)
    except:
        cell = 'Empty Cell No'
    data = {
        'title': title,
        'person name': person,
        'address': addr,
        'phone no': phone,
        'cell no': cell,
        'abn no': abn,
        'website': website
    }
    return data

def get_index_data(soup):
    titles = []
    for item in soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
        urls = f"https://hipages.com.au{item.previous_element.get('href')}"
        titles.append(urls)
    return titles

def Main():
    url = "https://hipages.com.au/connect/abcelectricservicespl/service/126298"
    mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
    main_titles = get_index_data(get_page(mainurl))
    for title in main_titles:
        data1 = get_detail_data(get_page(title))
        print(data1)

Main()
You need to parse your data from the script tag rather than the spans and divs.
Try this:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
from pandas import json_normalize
import json

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')
        return soup

def get_detail_data(url):
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "lxml")
    raw = res.text.split("<script> window.__INITIAL_STATE__=")[1]
    raw = raw.split("</script>")[0]
    data = json.loads(raw)
    data = json.loads(data)
    cols = ['abn', 'address', 'name', 'primary_location', 'service_area', 'state', 'suburb', 'website']
    df = pd.DataFrame(data["sites"]["list"]).T
    df = df[cols].reset_index(drop=True)
    primary_location = json_normalize(df.primary_location[0])
    df = pd.concat([df, primary_location], axis=1)
    to_drop = ["primary_location", "is_primary", "suburb_seo_key", "capital_city_seo_key"]
    df.drop(to_drop, axis=1, inplace=True)
    return df

def get_index_data(soup):
    titles = []
    for item in soup.findAll("h3", {'class': 'sc-bZQynM sc-iwsKbI dpKmnV'}):
        urls = f"https://hipages.com.au{item.previous_element.get('href')}"
        titles.append(urls)
    return titles

def Main():
    mainurl = "https://hipages.com.au/find/antenna_services/nsw/sydney"
    main_titles = get_index_data(get_page(mainurl))
    final_data = []
    for title in main_titles:
        data = get_detail_data(title)
        final_data.append(data)
    return final_data

data = Main()
df = pd.concat(data).reset_index(drop=True)
print(df)  # display(df) also works if you run this in a Jupyter notebook
This gives you much more detailed data by the way.
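If you also want a CSV like the earlier examples, pandas can write the combined frame directly (the filename here is just a placeholder):

df.to_csv('hipages_antenna_services.csv', index=False)  # hypothetical output path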
It is a public website containing data on companies in Sweden. I want to extract the title, email link, and location of each company from the first 10 pages. I wrote the code below, but it does not produce the right result, so please help me out. Thank you!
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'lxml')  # 1. html, 2. parser
        return soup

def get_detail_data(soup):
    try:
        title = soup.find('h1', class_="Name", id=False).text
    except:
        title = 'empty'
    try:
        address = soup.find('p', 'font', class_=False, id=False).text
    except:
        address = 'No location'
    try:
        email = soup.select_one("a[href^='mailto:']").get("href").split("mailto:")[1]
    except:
        email = 'No email'
    data = {
        'title': title,
        'address': address,
        'email': email
    }
    return data

def get_index_data(soup):
    try:
        titles_link = soup.find_all('div', {'class': 'Name'})
        for title in titles_link:
            link = soup.find('a', {'title': title.text}).get('href')
            titles = 'https://www.118100.se' + link
    except:
        titles_link = []
    return titles

def main():
    url = "https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p=0"
    products = get_index_data(get_page(url))
    for product in products:
        data = get_detail_data(get_page(product))
        print(data)

if __name__ == '__main__':
    main()
The problem is in the function get_index_data: you are not returning a list, you are returning a single link, because titles is overwritten on every iteration of the loop.
You should make titles a list and append to it. For example:
def get_index_data(soup):
    titles = []
    try:
        titles_link = soup.find_all('div', {'class': 'Name'})
        for title in titles_link:
            link = soup.find('a', {'title': title.text}).get('href')
            titles.append('https://www.118100.se' + link)
    except:
        return []
    return titles
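Since the question asks for the first 10 pages, main() can then loop over the p query parameter (a sketch, assuming p stays 0-indexed as in the original URL):

def main():
    for page in range(10):
        url = f"https://www.118100.se/sok/foretag/?q=brf&loc=&ob=rel&p={page}"
        for product in get_index_data(get_page(url)):
            data = get_detail_data(get_page(product))
            print(data)

if __name__ == '__main__':
    main()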