Python Scraper brings only 1 item...
I'm relatively new to Python, and I wrote a script to scrape one of my country's classifieds pages. So far the script only seems to be able to grab one item, which is driving me nuts: I've been trying to fix it for a week now and I don't know anyone who can help. I'd appreciate it if anyone could take a look and explain what exactly I am doing wrong here.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'
# Open the connection and download the raw page bytes.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# Parse the HTML.
page_soup = soup(page_html, "html5lib") # switched from "html.parser" to "html5lib" because html.parser was mangling the closing form tag
# All <form name="listing"> elements found on the page.
containers = page_soup.findAll("form",{"name":"listing"})
# Scratch variables used to try out the selector; nothing below reads them.
tags = containers[0].findAll("a", {"class":"Tahoma16Blacknounder"})
tagx = tags[0].text.strip()
filename = "products.csv"
f = open(filename, "w")
headers = "names, prices, city, product_condition\n"
f.write(headers)
# NOTE(review): indentation was lost in this listing; presumably everything
# from here down to the f.write(...) call is the loop body.
# The loop runs once per matched <form>, and only element [0] of each
# findAll() result is read, so every form contributes at most one CSV row.
for container in containers:
# Anchors holding the names of the classifieds.
names_container = container.findAll("a", {"class":"Tahoma16Blacknounder"})
names = names_container[0].text.strip() # first match only
# Spans of class "Tahoma14BrownNound" seem to hold the prices.
prices_container = container.findAll("span", {"class":"Tahoma14BrownNound"})
prices = prices_container[0].text # first match only
# <font> tags holding the city where the product is offered.
city_container = container.findAll("font", {"class":"tahoma14hbluenoUnder"})
city = city_container[0].text.strip() # first match only
# Spans holding the condition (state of use) of the product.
product_condition_container = container.findAll("span", {"class":"style14 style15 style16"})
product_condition = product_condition_container[0].text # first match only
print("names: " + names)
print("prices: " + prices)
print("city: " + city)
print("product_condition: " + product_condition)
# Commas inside the name are replaced with "|" so they do not split the CSV column.
f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
f.close()
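I looked at the site structure, and you're missing the parsing of the table inside the form. The page has a single `listing` form, so looping over the forms runs only once and you get one item; loop over the table rows (`tr`) inside that form instead.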
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'
filename = "products.csv"


def _first_text(node, tag_name, css_class):
    """Return the text of the first matching tag under *node*, or '' if none.

    Guarding the lookup here keeps a row with a missing field from raising
    IndexError and aborting the whole scrape.
    """
    matches = node.findAll(tag_name, {"class": css_class})
    return matches[0].text if matches else ""


# Download the page; the context manager closes the connection even on error.
with uReq(my_url) as uClient:
    page_html = uClient.read()

# html5lib is used instead of "html.parser" because html.parser was mangling
# the closing form tag on this page.
page_soup = soup(page_html, "html5lib")
containers = page_soup.findAll("form", {"name": "listing"})

# There is only one listing form; the individual classifieds are the table
# rows inside it, so those rows are what must be iterated.
rows = containers[0].findAll('tr', {"valign": "middle"}) if containers else []

with open(filename, "w", encoding="utf-8") as f:
    f.write("names, prices, city, product_condition\n")
    for row in rows:
        # Rows without a name link are layout rows, not classifieds.
        if not row.findAll("a", {"class": "Tahoma16Blacknounder"}):
            continue

        names = _first_text(row, "a", "Tahoma16Blacknounder").strip()
        prices = _first_text(row, "span", "Tahoma14BrownNound")
        city = _first_text(row, "font", "tahoma14hbluenoUnder").strip()
        product_condition = _first_text(row, "span", "style14 style15 style16")

        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)

        # Commas inside the name would split the CSV column, so swap them out.
        f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
Related
Very new to Python and currently stumped. The first several rows work correctly (they print and are written to the file just fine), but once the loop gets to row 11 it throws a "list index out of range" error, and it does the same for all the remaining rows.
I can't figure out for the life of me how, once it fails at row 11, to carry on with row 12 and run the try block again (it should be successful on the remaining rows).
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = "https://coinmarketcap.com/"
# Download the page.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# Every <tr> inside the first <tbody> on the page.
rows = page_soup.find("tbody").findAll("tr")
filename = "coins.csv"
f = open(filename, "w")
headers = "Rank, Name, Price, Circulating Supply\n"
f.write(headers)
# NOTE(review): indentation was lost in this listing; presumably the
# try/except below is the body of the loop.
for row in rows:
try:
# Cells are addressed by fixed position, so a row with fewer than nine <td>
# cells raises IndexError ("list index out of range") on one of these lookups.
# NOTE(review): presumably the failing rows are incomplete in the downloaded
# HTML - print len(row.findAll('td')) for each row to confirm.
rank_number = row.findAll('td')[1].text
coin_name = row.findAll('td')[2].text
coin_price = row.findAll('td')[3].text
supply = row.findAll('td')[8].text
# Commas are removed from price and supply so they do not split the CSV columns.
print(rank_number + "," + coin_name + "," + coin_price.replace(",","") + "," + supply.replace(",","") + "\n")
f.write(rank_number + "," + coin_name + "," + coin_price.replace(",","") + "," + supply.replace(",","") + "\n")
except Exception as e:
# Report the failure and move on to the next row.
print(e)
continue
f.close()
Any help would be greatly appreciated!
Thanks!
Would highly appreciate your support, I'm using Python BeautifulSoup:
I need to simply run this code on multiple pages (aka scrape the same data on pages 1 to 1290). I am new to this and I can imagine it is not so complicated since the URL is pretty straight forward with the page numbers
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
# Only page=1 is requested; the page number is hard-coded in the query string.
my_url = 'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page=1&t=3000'
# Open the connection and grab the page.
uClient = uReq(my_url)
# Offload the page content into a variable.
page_html = uClient.read()
uClient.close()
# Parse the HTML.
page_soup = soup(page_html, "html.parser")
# One <div class="card__content"> per listing.
cards = page_soup.findAll("div",{"class":"card__content"})
# Raises IndexError when no cards were found; the value is overwritten by the
# loop variable of the same name below.
contain_cards = cards[0]
# Create the output file and write the header row.
filename = "propertyfinder.csv"
f = open(filename, "w")
headers = "title,address,area,bedrooms,bathrooms,price\n"
f.write(headers)
# NOTE(review): indentation was lost in this listing; presumably everything
# from here down to the f.write(...) call is the loop body.
# Each lookup below reads element [0], so a card missing any of these tags
# raises IndexError.
for contain_cards in cards:
# Title
title_container = contain_cards.findAll("h2",{"class":"card__title card__title-link"})
title = title_container[0].text
# Address
address_container = contain_cards.findAll("span",{"class":"card__location-text"})
address = address_container[0].text
# Price, with the "EGP" currency marker removed.
price_container = contain_cards.findAll("span",{"class":"card__price-value"})
price = (price_container[0].text.strip()).replace("EGP","")
# Bedrooms
bedrooms_container = contain_cards.findAll("p",{"class":"card__property-amenity card__property-amenity--bedrooms"})
bedrooms = bedrooms_container[0].text.strip()
# Bathrooms
bathrooms_container = contain_cards.findAll("p",{"class":"card__property-amenity card__property-amenity--bathrooms"})
bathrooms = bathrooms_container[0].text.strip()
# Area
area_container = contain_cards.findAll("p",{"class":"card__property-amenity card__property-amenity--area"})
area = area_container[0].text
# Echo the record, then write it as one CSV row.
print (title)
print (address)
print (area)
print (bedrooms)
print (bathrooms)
print (price)
# Commas in title/address become "|" and commas in price are dropped so the
# values do not split the CSV columns.
f.write(title.replace(",","|") + "," + address.replace(",","|") + "," + area + "," + bedrooms + "," + bathrooms + "," + price.replace(",","") + "\n" )
f.close()
Try something like this:
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
# Scrape every results page into one CSV file.
FIRST_PAGE = 1
LAST_PAGE = 1290
filename = "propertyfinder.csv"
AMENITY = "card__property-amenity card__property-amenity--"


def _card_text(card, tag_name, css_class):
    """Return the text of the first matching tag in *card*.

    Raises IndexError when the card has no such tag.
    """
    return card.findAll(tag_name, {"class": css_class})[0].text


# encoding="utf-8" avoids the UnicodeEncodeError that the platform default
# encoding can raise on listings containing non-ASCII text.
with open(filename, 'w', encoding='utf-8') as f:
    f.write("title,address,area,bedrooms,bathrooms,price\n")

    for num in range(FIRST_PAGE, LAST_PAGE + 1):
        my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={num}&t=3000'
        # The context manager closes the connection even if read() fails.
        with uReq(my_url) as uClient:
            page_html = uClient.read()

        page_soup = soup(page_html, "html.parser")
        cards = page_soup.findAll("div", {"class": "card__content"})

        for card in cards:
            try:
                title = _card_text(card, "h2", "card__title card__title-link")
                address = _card_text(card, "span", "card__location-text")
                price = _card_text(card, "span", "card__price-value").strip().replace("EGP", "")
                bedrooms = _card_text(card, "p", AMENITY + "bedrooms").strip()
                bathrooms = _card_text(card, "p", AMENITY + "bathrooms").strip()
                area = _card_text(card, "p", AMENITY + "area")
            except IndexError:
                # This card lacks one of the fields; skip it, not the whole page.
                continue

            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)

            # Commas in title/address become "|" and commas in price are
            # dropped so the values do not split the CSV columns.
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + ","
                    + area + "," + bedrooms + "," + bathrooms + ","
                    + price.replace(",", "") + "\n")
Note: I bypass some UnicodeEncodeError exceptions with the try/except, but this should give you the idea of how to run the script over multiple pages.
Figured it out as following for anyone's reference:
from bs4 import BeautifulSoup
import requests
def _card_field(card, tag_name, css_class):
    """Return the stripped text of the first matching tag in *card*.

    Raises IndexError when the card has no such tag.
    """
    return card.find_all(tag_name, {"class": css_class})[0].text.strip()


def scrape_properties(page):
    """Scrape one propertyfinder.eg results page into propertyfinder.csv.

    Page 1 creates (or truncates) the file and writes the header row; every
    other page number appends. Cards missing any of the expected fields are
    skipped.

    Args:
        page: Page number substituted into the search URL.
    """
    my_url = f'https://www.propertyfinder.eg/en/search?c=1&ob=mr&page={page}&t=3000'
    # Browser-like request headers, as in the original request.
    request_headers = {
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
        'referer': 'https://google.com',
    }
    response = requests.get(my_url, headers=request_headers)

    page_soup = BeautifulSoup(response.text, "html.parser")
    # An empty result list simply yields no rows (no cards[0] lookup to fail).
    cards = page_soup.find_all("div", {"class": "card__content"})

    filename = "propertyfinder.csv"
    amenity = "card__property-amenity card__property-amenity--"
    mode = "w" if page == 1 else "a"

    # `with` closes the file even if a write fails; utf-8 avoids
    # UnicodeEncodeError on listings containing non-ASCII text.
    with open(filename, mode, encoding="utf-8") as f:
        if page == 1:
            f.write("title,address,area,bedrooms,bathrooms,price,ptype\n")

        for card in cards:
            try:
                title = _card_field(card, "h2", "card__title card__title-link")
                address = _card_field(card, "span", "card__location-text")
                price = _card_field(card, "span", "card__price-value").replace("EGP", "").strip()
                bedrooms = _card_field(card, "p", amenity + "bedrooms")
                bathrooms = _card_field(card, "p", amenity + "bathrooms")
                area = _card_field(card, "p", amenity + "area")
                ptype = _card_field(card, "p", amenity + "property-type")
            except IndexError:
                # Only a missing field is tolerated; other errors should surface.
                continue

            print(title)
            print(address)
            print(area)
            print(bedrooms)
            print(bathrooms)
            print(price)
            print(ptype)

            # Commas in title/address become "|" and commas in area/price are
            # dropped so the values do not split the CSV columns.
            f.write(title.replace(",", "|") + "," + address.replace(",", "|") + ","
                    + area.replace(",", "") + "," + bedrooms + "," + bathrooms + ","
                    + price.replace(",", "") + "," + ptype + "\n")
for page in range(1, 100):
scrape_properties(page)
Recently I've tried to code a yp.com list scraper. But could not figure out why the code is printing only one row in the .csv file.
yp_urls.txt urls are:
https://www.yellowpages.com/search-map?search_terms=restaurant&geo_location_terms=Boston
https://www.yellowpages.com/search-map?search_terms=restaurant&geo_location_terms=Boston&page=2
Here is the code:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
# NOTE(review): indentation was lost in this listing, so which statements sit
# inside which loop cannot be read off the code below.
# `f` is the handle for the list of URLs to scrape.
with open('yp_urls.txt', 'r') as f:
for url in f:
print(url)
# Each line read from the file still carries its trailing newline here.
uClient = urlopen(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
# One <div class="v-card"> per business listing.
containers = page_soup.findAll("div",{"class":"v-card"})
#container= containers[0]
out_filename = "yp_listing.csv"
headers = "URL \n"
# This rebinds `f`, which until now named the yp_urls.txt handle.
# Mode "w" truncates yp_listing.csv every time this line executes.
f = open(out_filename, "w")
f.write(headers)
for container in containers:
# href of the first <a> in the card; str.title() capitalizes each word of it.
business = container.a["href"].title()
print("business:" + business + "\n" )
f.write(business + "," + "\n")
f.close() # Close the file
Issues:
The code inside your for blocks wasn't properly indented.
Open output file handle outside the for loop.
Try:
from urllib.request import urlopen
from bs4 import BeautifulSoup as soup
out_filename = "yp_listing.csv"

# Both files are opened once, outside the loop, so rows from every URL
# accumulate in the same output file; `with` closes both on exit.
with open('yp_urls.txt', 'r') as f, open(out_filename, "w") as fout:
    fout.write("URL \n")

    for line in f:
        # Lines keep their trailing newline, which must not reach urlopen().
        url = line.strip()
        if not url:
            continue
        print(url)

        # The context manager closes the connection even if read() fails.
        with urlopen(url) as uClient:
            page_html = uClient.read()

        page_soup = soup(page_html, "html.parser")
        containers = page_soup.findAll("div", {"class": "v-card"})

        for container in containers:
            link = container.a
            # Skip cards that have no link instead of raising TypeError/KeyError.
            if link is None or not link.get("href"):
                continue
            # NOTE(review): str.title() changes the capitalisation of the href;
            # kept from the original, but confirm that this is intended.
            business = link["href"].title()
            print("business:" + business + "\n")
            fout.write(business + "," + "\n")
It appears that the f.write commands are outside of your loops, so are only being hit once the loops are completed.
For example, the code loops through the urls, then exits the loop and executes f.write(headers), then loops through containers, exits that loop and f.write(business:..)
You may also wish to check if the output file is being opened in right state with 'w' (write/overwrite) versus 'a' (append). Perhaps also consider changing the handles so both are not 'f'.
I have this code, which works fine with a single link.
The code stores the values (availableOffers, otherpricess, currentprice, page_url) in the file prices.csv.
My problems are as follows. First: I do not know what to write to fetch the URLs from my text file (or my XML file) instead of the single URL in this code.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
# Single hard-coded page to scrape.
page_url = "XXXXXXXXX"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
# find() returns None when nothing matches, so each of the three lookups below
# raises (TypeError on ["value"], AttributeError on .text) if its tag is absent.
availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
# Mode "w" creates the file or truncates an existing one.
f = open(out_filename, "w")
f.write(headers)
f.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
f.close()
Second problem: when a URL does not have a value for otherpricess, I get this error:
line 13, in <module>
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
AttributeError: 'NoneType' object has no attribute 'text'
How do I bypass this error and tell the code to keep working even when a value is missing?
thanks
To fetch URLs from a text file, you can open the file (exactly as you did for writing) in "r" mode and iterate over its lines.
For example, lets say you have the following urls file, named urls.txt:
http://www.google.com
http://www.yahoo.com
In order to fetch the urls and iterate over them, do the following:
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"

# Output and input files are both opened once; `with` closes them on exit.
with open(out_filename, "w") as fw, open("urls.txt", "r") as fr:
    fw.write(headers)

    for line in fr:
        # strip() removes the trailing '\n'; blank lines are skipped.
        url = line.strip()
        if not url:
            continue
        print(url)

        # The context manager closes the connection even if read() fails.
        with uReq(url) as uClient:
            page_soup = soup(uClient.read(), "html.parser")

        # find() returns None when a tag is absent; fall back to an empty
        # string so one missing value does not abort the whole run.
        offers_tag = page_soup.find("input", {"id": "availableOffers"})
        price_tag = page_soup.find("span", {"class": "price"})
        current_tag = page_soup.find("div", {"class": "is"})

        availableOffers = offers_tag.get("value", "") if offers_tag is not None else ""
        otherpricess = price_tag.text.replace("$", "") if price_tag is not None else ""
        currentprice = current_tag.text.strip().replace("$", "") if current_tag is not None else ""

        # Write one row per URL, recording the URL that was actually fetched.
        fw.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + url + "\n")
Regarding your second question, you can check that page_soup.find("span", {"class": "price"}) is not None and if so, extract the text. For example:
# Look the tag up once; find() returns None when the page has no such span.
price_tag = page_soup.find("span", {"class": "price"})
# With no value, otherpricess is an empty string; change it to any other default.
otherpricess = price_tag.text.replace("$", "") if price_tag is not None else ""
I am an absolute beginner, but I have managed to make a working script out of some existing scripts and tutorials. Only one thing I would like to have, unfortunately I can not do that.
So far, I'm getting data from a website that is, for example, "http://www.example.com/01536496/.../". Now I have a list (.csv or .txt) with many other numbers in the first column (or in txt-file each number in a new row). Now I want to scrape the web data for all the numbers in the list, so "http://www.example.com/No_1/.../", "http://www.example.com/No_2/.../" and so on.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import datetime
# The number in the path is hard-coded; this is the part that should vary.
my_url = 'http://www.example.com/104289633/.../'
# Download the page and close the connection.
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# Parse the downloaded HTML.
page_soup = soup(page_html, "html.parser")
...
Update
For example I have a numbers.txt with: 05543486 3468169 36189994
Now I want to put each number into the url...
Please can someone help me. I would be very grateful.
Update
After trying to use the code from Andersson...
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import datetime
# Get the list of numbers: split() with no arguments splits on any whitespace,
# so the file may hold one number per line or several per line.
with open("numbers.txt") as f:
content = f.read()
numbers = content.split()
# Handle each URL in a loop.
# NOTE(review): indentation was lost in this listing, so where the loop body
# ends cannot be read off the code below.
for number in numbers:
my_url = 'https://www.immobilienscout24.de/expose/%s#/' %number
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
print(my_url)
page_soup = soup(page_html, "html.parser")
containers = page_soup.find_all("div", {"class":"grid-item padding-desk-right-xl desk-two-thirds lap-one-whole desk-column-left flex-item palm--flex__order--1 lap--flex__order--1"})
# NOTE(review): current_datetime is not assigned anywhere in this listing -
# presumably it is set in an elided part; confirm.
filename = "results_"+current_datetime+".csv"
# Mode "w" truncates the file each time this line executes.
f = open(filename, "w")
headers = "titel##adresse##criteria##preis##energie##beschreibung##ausstattung##lage\n"
f.write(headers)
...
# Fields are joined with "##"; newlines inside the free-text fields are
# flattened to spaces so that each record stays on one line.
f.write(titel + "##" + adresse + "##" + criteria.replace(" ", "; ") + "##" + preis.replace(" ", "; ") + "##" + energie.replace(" ", "; ") + "##" + beschreibung.replace("\n", " ") + "##" + ausstattung.replace("\n", " ") + "##" + lage.replace("\n", " ") + "\n")
f.close()
You can use below code:
# Get list of numbers. split() with no arguments splits on any whitespace,
# so the numbers may be separated by spaces or by newlines.
with open("/path/to/numbers.txt") as f:
    numbers = f.read().split()

# Handle each URL in a loop
for number in numbers:
    url = 'http://www.example.com/%s' % number
    # Do something with url
You can create a function that runs a for loop and update the url on each iteration through the loop. As the argument, you can pass the list of numbers. For example:
def scrape(numbers):
    """Download and parse the page for every number in *numbers*.

    Args:
        numbers: Iterable of values; each is converted with ``str`` and
            inserted into the URL.

    Returns:
        A list with one parsed page per number, in input order (empty when
        *numbers* is empty).
    """
    pages = []
    for num in numbers:
        my_url = 'http://www.example.com/No_' + str(num) + '/.../'
        # The context manager closes the connection even if read() fails.
        with uReq(my_url) as uClient:
            page_html = uClient.read()
        pages.append(soup(page_html, "html.parser"))
    return pages
numbers_list = [1, 2, 3, 4, 5]
scrape(numbers_list)
You can achieve this by appending the numbers at the end of your url with a basic for loop ? I am not sure if this is what you need.
...
with open('yourFile', 'r') as numbersFile:
    # One number per line: strip the trailing newline (it would otherwise end
    # up inside the URL) and ignore blank lines.
    nums = [line.strip() for line in numbersFile if line.strip()]

for num in nums:
    url = "http://www.example.com/No_" + num + "/.../"
    # do what you want to do with the url...
Load from csv file
You can iterate over the file rows in various ways, but what I think is the most clean one is by using pandas.
You just need to do this:
import pandas as pd
df = pd.read_csv("filename.csv")
# assuming that filename.csv's first line has a header called "Numbers"
# You can apply a function `func` to each element of the column via `map`
df['Numbers'].map(func)
Urls from Numbers
Using pandas' map function, we can pass each value to a function to create our url.
# First of all, we define this function
def numberToUrl(number):
    """Return the example.com URL for *number*.

    The value is inserted with ``str.format``, so anything with a string
    representation (int, str, ...) is accepted.
    """
    return 'http://www.example.com/{}/.../'.format(number)
# Then we can pass this function to each value with `map`
# and assign the result to a new column
df['url'] = df['Numbers'].map(numberToUrl)
# We can print the first 5 elements via:
df.head()
As you can see, it's extremely simple to pass a function to each row.
If you want to iterate over the rows you can do it like so:
# Series.items() yields (index label, value) pairs; it replaces
# Series.iteritems(), which was removed in pandas 2.0.
for index, row in df['url'].items():
    # Do your operations here
    pass
In your case it would be something like this:
# Series.items() yields (index label, value) pairs; it replaces
# Series.iteritems(), which was removed in pandas 2.0. Each value is a URL.
for index, url in df['url'].items():
    # The context manager closes the connection even if read() fails.
    with uReq(url) as uClient:
        page_html = uClient.read()
    page_soup = soup(page_html, "html.parser")
    # ...
Additional notes
I would not recommend using urllib.request directly. Instead, you could use the third-party requests library, which offers a simpler, higher-level API.