I would highly appreciate your support. I'm using Python with BeautifulSoup:
I need to run this code on multiple pages (i.e. scrape the same data on pages 1 to 1290). I am new to this, but I imagine it is not too complicated, since the URL contains the page number in a straightforward way.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'
#Opening the connection and grabbing the page
uClient = uReq(my_url)
#offload page content into a variable
page_html = uClient.read()
uClient.close()
#html parsing
page_soup = soup(page_html, "html.parser")
cards = page_soup.findAll("div",{"class":"card__content"})
contain_cards = cards[0]
#file creation
filename = "propertyfinder.csv"
f = open(filename, "w")
headers = "title,address,area,bedrooms,bathrooms,price\n"
f.write(headers)
##DATA
for contain_cards in cards:
    #TITLE
    title_container = contain_cards.findAll("h2",{"class":"card__title card__title-link"})
    title = title_container[0].text
    #ADDRESS
    address_container = contain_cards.findAll("span",{"class":"card__location-text"})
    address = address_container[0].text
    #PRICE
    price_container = contain_cards.findAll("span",{"class":"card__price-value"})
    price = (price_container[0].text.strip()).replace("EGP","")
    #BEDROOMS
    bedrooms_container = contain_cards.findAll("p",{"class":"card__property-amenity card__property-amenity--bedrooms"})
    bedrooms = bedrooms_container[0].text.strip()
    #BATHROOMS
    bathrooms_container = contain_cards.findAll("p",{"class":"card__property-amenity card__property-amenity--bathrooms"})
    bathrooms = bathrooms_container[0].text.strip()
    #AREA
    area_container = contain_cards.findAll("p",{"class":"card__property-amenity card__property-amenity--area"})
    area = area_container[0].text
    #CLOSING
    print(title)
    print(address)
    print(area)
    print(bedrooms)
    print(bathrooms)
    print(price)
    f.write(title.replace(",","|") + "," + address.replace(",","|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",","") + "\n")
f.close()
Try something like this:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
# file creation
num = 1
filename = "propertyfinder.csv"
with open(filename, 'w') as f:
    headers = "title,address,area,bedrooms,bathrooms,price\n"
    f.write(headers)
    while True:
        my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={num}&t=3000'
        uClient = uReq(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll("div", {"class": "card__content"})
        contain_cards = cards[0]
        try:
            for contain_cards in cards:
                # TITLE
                title_container = contain_cards.findAll("h2", {"class": "card__title card__title-link"})
                title = title_container[0].text
                # ADDRESS
                address_container = contain_cards.findAll("span", {"class": "card__location-text"})
                address = address_container[0].text
                # PRICE
                price_container = contain_cards.findAll("span", {"class": "card__price-value"})
                price = (price_container[0].text.strip()).replace("EGP", "")
                # BEDROOMS
                bedrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
                bedrooms = bedrooms_container[0].text.strip()
                # BATHROOMS
                bathrooms_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
                bathrooms = bathrooms_container[0].text.strip()
                # AREA
                area_container = contain_cards.findAll("p", {"class": "card__property-amenity card__property-amenity--area"})
                area = area_container[0].text
                # CLOSING
                print(title)
                print(address)
                print(area)
                print(bedrooms)
                print(bathrooms)
                print(price)
                f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "\n")
        except:
            pass
        num += 1
        if num > 1290:
            break
Note: I bypass some UnicodeEncodeError cases with the try/except, but this gives you the idea of how to run multiple pages in the script.
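A less drastic way to deal with that UnicodeEncodeError is to open the CSV with an explicit UTF-8 encoding instead of swallowing exceptions; a minimal sketch of just the file-creation part, keeping the scraping loop above unchanged:

# open the output file as UTF-8 so non-ASCII characters in listings don't raise UnicodeEncodeError
with open("propertyfinder.csv", "w", encoding="utf-8") as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")
    # ... the while/for scraping loop from above goes here unchanged ...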
Figured it out as follows, for anyone's reference:
from bs4 import BeautifulSoup
import requests
def scrape_properties(page):
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'
    # Opening the connection and grabbing the page
    headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=headers)
    # html parsing
    page_soup = BeautifulSoup(response.text, "html.parser")
    cards = page_soup.find_all("div", {"class": "card__content"})
    contain_cards = cards[0]
    # file creation
    filename = "propertyfinder.csv"
    if page == 1:
        f = open(filename, "w")
        csv_headers = "title,address,area,bedrooms,bathrooms,price,ptype\n"
        f.write(csv_headers)
    else:
        f = open(filename, "a")
    ## DATA
    for contain_cards in cards:
        try:
            # TITLE
            title_container = contain_cards.find_all("h2", {"class": "card__title card__title-link"})
            title = title_container[0].text.strip()
            # ADDRESS
            address_container = contain_cards.find_all("span", {"class": "card__location-text"})
            address = address_container[0].text.strip()
            # PRICE
            price_container = contain_cards.find_all("span", {"class": "card__price-value"})
            price = price_container[0].text.strip().replace("EGP", "").strip()
            # BEDROOMS
            bedrooms_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--bedrooms"})
            bedrooms = bedrooms_container[0].text.strip()
            # BATHROOMS
            bathrooms_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--bathrooms"})
            bathrooms = bathrooms_container[0].text.strip()
            # AREA
            area_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--area"})
            area = area_container[0].text.strip()
            # PTYPE
            ptype_container = contain_cards.find_all("p", {"class": "card__property-amenity card__property-amenity--property-type"})
            ptype = ptype_container[0].text.strip()
            # CLOSING
            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            print(ptype)
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + "," + area.replace(",", "") + "," + bedrooms + "," + bathrooms + "," + price.replace(",", "") + "," + ptype + "\n")
        except:
            pass
    f.close()

for page in range(1, 100):
    scrape_properties(page)
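A side note on the CSV writing: the replace(",", "|") calls above are only there to keep commas in titles and addresses from breaking the file. Letting Python's csv module quote the fields removes the need for that manual escaping; a minimal sketch of just the writing part, assuming the same row variables as in the function above:

import csv

with open("propertyfinder.csv", "a", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    # writer.writerow(["title", "address", "area", "bedrooms", "bathrooms", "price", "ptype"])  # header, written once on page 1
    # inside the card loop, instead of the manual string concatenation:
    writer.writerow([title, address, area, bedrooms, bathrooms, price, ptype])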
Very new to Python and currently stumped. What keeps happening is that the first several rows work correctly, print, and write to the file just fine, but once it gets to row 11 it throws a "list index out of range" error, and does the same for the remaining rows as well.
I can't figure out for the life of me how, once it fails at row 11, to start over on row 12 and run the try again (it should be successful on the remaining rows).
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = "https://coinmarketcap.com/"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
rows = page_soup.find("tbody").findAll("tr")
filename = "coins.csv"
f = open(filename, "w")
headers = "Rank, Name, Price, Circulating Supply\n"
f.write(headers)
for row in rows:
    try:
        rank_number = row.findAll('td')[1].text
        coin_name = row.findAll('td')[2].text
        coin_price = row.findAll('td')[3].text
        supply = row.findAll('td')[8].text
        print(rank_number + "," + coin_name + "," + coin_price.replace(",","") + "," + supply.replace(",","") + "\n")
        f.write(rank_number + "," + coin_name + "," + coin_price.replace(",","") + "," + supply.replace(",","") + "\n")
    except Exception as e:
        print(e)
        continue
f.close()
Any help would be greatly appreciated!
Thanks!
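One way to let the loop recover past the failing rows is to grab the cells once per row and skip any row that does not have enough of them before indexing; a hedged sketch, assuming the failing rows simply contain fewer <td> cells in the HTML that urlopen receives:

for row in rows:
    cells = row.findAll('td')
    if len(cells) < 9:
        # not enough cells to read index 8 (circulating supply); skip this row
        continue
    rank_number = cells[1].text
    coin_name = cells[2].text
    coin_price = cells[3].text
    supply = cells[8].text
    f.write(rank_number + "," + coin_name + "," + coin_price.replace(",", "") + "," + supply.replace(",", "") + "\n")
f.close()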
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://www.flipkart.com/search?q=iphone+12&sid=tyy%2C4io&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_1_6_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_1_6_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=iphone+12%7CMobiles&requestId=71ed5a8e-4348-4fef-9af8-43b7be8c4d83'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "_13oc-S"})
#print(len(containers))  # will tell the number of products on the respective page
#print(soup.prettify(containers[0]))  # will show the first container in an organised manner
container = containers[0]
#print(container.div.img["alt"])  # will display the name of the respective product
price = container.findAll("div", {"class": "col col-5-12 nlI3QM"})  # holds the price of the respective product
#print(price[0].text)
ratings = container.findAll("div", {"class": "gUuXy-"})
#print(ratings[0].text)
#Making a file
filename="products.csv"
f= open(filename, "w")
#Naming the headers
headers="Product_Name,Pricing,Ratings\n"
f.write(headers)
for container in containers:
    product_name = container.div.img["alt"]
    price_container = container.findAll("div", {"class": "col col-5-12 nlI3QM"})
    price = price_container[0].text.strip()
    rating_container = container.findAll("div", {"class": "gUuXy-"})
    rating = rating_container[0].text
    #print("product_name:" + product_name)
    #print("price:" + price)
    #print("ratings:" + rating)
    #string parsing
    trim_price = ''.join(price.split(','))
    rm_rupee = trim_price.split("₹")
    add_rs_price = "Rs." + rm_rupee[0]
    split_price = add_rs_price.split('E')
    final_price = split_price[0]
    split_rating = rating.split(" ")
    final_rating = split_rating[0]
    print(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
    f.write(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
f.close()
f.write(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
I am getting an error on this specific line. I want to make a .CSV file, but the products are not being written to it.
The error is:
Exception has occurred: UnicodeEncodeError
'charmap' codec can't encode character '\u20b9' in position 35: character maps to <undefined>
File "D:\Visual Code Folder\Python\Scraping_Flipkart.py", line 61, in <module>
f.write(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
Replace this
f= open(filename, "w")
with this
import io
f = io.open(filename, "w", encoding="utf-8")
Using io gives you backward compatibility with Python 2.
If you only need to support Python 3 you can use the builtin open function instead:
with open(fname, "w", encoding="utf-8") as f:
    f.write(html)
I have this code, which all works fine with one link.
The result of the code stores the values (availableOffers, otherpricess, currentprice, page_url) in the (prices.csv) file.
My problems are: First, I do not know what to write to fetch URLs from my text file (or my xml file) instead of the one URL in this code.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "XXXXXXXXX"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
f = open(out_filename, "w")
f.write(headers)
f.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
f.close()
Second problem: when a URL does not have a value for (otherpricess), I get this error
line 13, in <module>
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
AttributeError: 'NoneType' object has no attribute 'text'
How do I bypass this error and tell the code to keep working even when a value is missing?
thanks
To fetch urls from a text file, you can open the file (exactly as you did for writing) in "r" mode and iterate over its lines.
For example, let's say you have the following urls file, named urls.txt:
http://www.google.com
http://www.yahoo.com
In order to fetch the urls and iterate over them, do the following:
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
with open(out_filename, "w") as fw:
    fw.write(headers)
    with open("urls.txt", "r") as fr:
        for url in map(lambda x: x.strip(), fr.readlines()):  # the strip is to remove the trailing '\n'
            print(url)
            uClient = uReq(url)
            page_soup = soup(uClient.read(), "html.parser")
            # write the rest of the logic here
            # ...
            # write to the output file
            fw.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + url + "\n")
Regarding your second question, you can check that page_soup.find("span", {"class": "price"}) is not None and if so, extract the text. For example:
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "") if page_soup.find("span", {"class": "price"}) else ""
# in case there is no value, otherpricess will be empty string but you can change it to any other value.
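The same check can also be written by assigning the tag once, so find() is not called twice; a small sketch under the same assumption that the tag may be missing:

price_tag = page_soup.find("span", {"class": "price"})
otherpricess = price_tag.text.replace("$", "") if price_tag else ""  # empty string when there is no price tag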
This is my scrap.py code
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
website = "https://houston.craigslist.org/search/cta"
uClient = uReq(website)
page_html = uClient.read()
uClient.close()
soup_html = soup(page_html, "html.parser")
result_html = soup_html.findAll("p", {"class":"result-info"})
filename = "products.csv"
f = open(filename, "w", encoding='utf8')
headers = "car_name, price\n"
f.write(headers)
for container in result_html:
    carname = container.a.text
    price_container = container.findAll('span', {'class':'result-price'})
    price = price_container[0].text
    f.write(carname + "," + price + "\n")
f.close()
In the terminal it works fine, however when it loops it gives the following error:
Traceback (most recent call last):
File "scrap.py", line 23, in <module>
price = price_container[0].text.splitlines()
IndexError: list index out of range
Please help. Thanks
Try the one below. It will fetch all the items and prices, and handle the IndexError if there is any.
from bs4 import BeautifulSoup
from urllib.request import urlopen
response = urlopen("https://houston.craigslist.org/search/cta")
soup_html = BeautifulSoup(response.read(), "html.parser")
for container in soup_html.find_all("p", {"class":"result-info"}):
    carname = container.find_all("a")[0].text
    try:
        price = container.find_all('span', {'class':'result-price'})[0].text
    except IndexError:
        price = ""
    print(carname, price)
I tried to shorten your code to make it look better.
This is because some cars just have no price, e.g. this one. You can set the price to 'unknown' if there was no price:
price_container = container.findAll('span', {'class':'result-price'})
if len(price_container) > 0:
    price = price_container[0].text
else:
    price = 'unknown'
Or you could just skip the ones without price so they'll not get written to the file:
price_container = container.findAll('span', {'class':'result-price'})
if len(price_container) == 0:
    continue
price = price_container[0].text
How can I sort it by price?
results = []
for container in result_html:
    carname = container.a.text
    price_container = container.findAll('span', {'class':'result-price'})
    if len(price_container) == 0:
        continue
    price = price_container[0].text.strip('$')
    results.append((int(price), carname))

for price, carname in sorted(results):
    f.write("{}, {}\n".format(carname, price))
f.close()
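One caveat on the int(price) conversion: if a listing's price ever contains a thousands separator (for example "$12,500"), int() will raise a ValueError. A defensive variant of that one append line, purely as a precaution:

results.append((int(price.replace(',', '')), carname))  # drop any thousands separators before converting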
Python scraper brings back only 1 item...
I'm relatively new to Python, and I made a script to scrape one of my country's classifieds pages. So far the script only seems to be able to grab one item, which is really driving me nuts because I've been trying to fix it for a week now and I don't really know anyone who can help. I'd appreciate it if anyone could take a look and try to explain to me what exactly I am doing wrong here.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'
# opening ip connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML PARSER
page_soup = soup(page_html, "html5lib") # switched from "html.parser" to "html5lib" because the closing form tag was being mangled
containers = page_soup.findAll("form",{"name":"listing"})
#testing variables
tags = containers[0].findAll("a", {"class":"Tahoma16Blacknounder"})
tagx = tags[0].text.strip()
filename = "products.csv"
f = open(filename, "w")
headers = "names, prices, city, product_condition\n"
f.write(headers)
for container in containers:
    #holds the names of the classifieds
    names_container = container.findAll("a", {"class":"Tahoma16Blacknounder"})
    names = names_container[0].text.strip() # comment here later
    #the span class "Tahoma14BrownNound" seems to hold the prices
    #container.findAll("span", {"class":"Tahoma14BrownNound"})
    prices_container = container.findAll("span", {"class":"Tahoma14BrownNound"})
    prices = prices_container[0].text # comment here later
    #holds the city of use of the products
    city_container = container.findAll("font", {"class":"tahoma14hbluenoUnder"})
    city = city_container[0].text.strip() # comment here later
    #holds the states of use of the products
    product_condition_container = container.findAll("span", {"class":"style14 style15 style16"})
    product_condition = product_condition_container[0].text # comment here later
    print("names: " + names)
    print("prices: " + prices)
    print("city: " + city)
    print("product_condition: " + product_condition)
    f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
f.close()
I looked at the site structure and you're missing the parsing of the table after the form.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'
# opening ip connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML PARSER
page_soup = soup(page_html, "html5lib") # switched from "html.parser" to "html5lib" because the closing form tag was being mangled
containers = page_soup.findAll("form",{"name":"listing"})
#testing variables
tags = containers[0].findAll("a", {"class":"Tahoma16Blacknounder"})
tagx = tags[0].text.strip()
filename = "products.csv"
f = open(filename, "w")
headers = "names, prices, city, product_condition\n"
f.write(headers)
tr = containers[0].findAll('tr', {"valign":"middle"})
for container in tr:
    if len(container.findAll("a", {"class":"Tahoma16Blacknounder"})) > 0:
        #holds the names of the classifieds
        names_container = container.findAll("a", {"class":"Tahoma16Blacknounder"})
        names = names_container[0].text.strip() # comment here later
        #the span class "Tahoma14BrownNound" seems to hold the prices
        #container.findAll("span", {"class":"Tahoma14BrownNound"})
        prices_container = container.findAll("span", {"class":"Tahoma14BrownNound"})
        prices = prices_container[0].text if len(prices_container) > 0 else ''
        #holds the city of use of the products
        city_container = container.findAll("font", {"class":"tahoma14hbluenoUnder"})
        city = city_container[0].text.strip() # comment here later
        #holds the states of use of the products
        product_condition_container = container.findAll("span", {"class":"style14 style15 style16"})
        product_condition = product_condition_container[0].text # comment here later
        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)
        f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
f.close()