Python getting stuck in exception flow - python

Very new to Python and currently stumped. The first several rows work correctly, print, and write to the file just fine, but once the loop gets to row 11 it throws a "list index out of range" error, and it does that for the remaining rows as well.
I can't figure out for the life of me how, once it fails at row 11, to start over on row 12 and run the try again (it should be successful on the remaining rows).
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = "https://coinmarketcap.com/"
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
rows = page_soup.find("tbody").findAll("tr")
filename = "coins.csv"
f = open(filename, "w")
headers = "Rank, Name, Price, Circulating Supply\n"
f.write(headers)
for row in rows:
    try:
        rank_number = row.findAll('td')[1].text
        coin_name = row.findAll('td')[2].text
        coin_price = row.findAll('td')[3].text
        supply = row.findAll('td')[8].text
        print(rank_number + "," + coin_name + "," + coin_price.replace(",", "") + "," + supply.replace(",", "") + "\n")
        f.write(rank_number + "," + coin_name + "," + coin_price.replace(",", "") + "," + supply.replace(",", "") + "\n")
    except Exception as e:
        print(e)
        continue
f.close()
Any help would be greatly appreciated!
Thanks!
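One minimal sketch of a way around this (my own illustration, not from this thread), assuming the failing rows simply have fewer than nine td cells, is to grab the cells once per row and skip the short ones instead of relying on the exception:
for row in rows:
    cells = row.findAll('td')
    # skip rows that do not have enough cells (e.g. placeholder or ad rows)
    if len(cells) < 9:
        continue
    rank_number = cells[1].text
    coin_name = cells[2].text
    coin_price = cells[3].text.replace(",", "")
    supply = cells[8].text.replace(",", "")
    line = rank_number + "," + coin_name + "," + coin_price + "," + supply + "\n"
    print(line)
    f.write(line)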

Related

Having a Syntax error in the code of Web Scraping (Python 3)?

from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
my_url = 'https://www.flipkart.com/search?q=iphone+12&sid=tyy%2C4io&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_1_6_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_1_6_na_na_na&as-pos=1&as-type=HISTORY&suggestionId=iphone+12%7CMobiles&requestId=71ed5a8e-4348-4fef-9af8-43b7be8c4d83'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div", {"class": "_13oc-S"})
#print(len(containers))               # number of products on the page
#print(soup.prettify(containers[0]))  # prints the page in an organised manner
container = containers[0]
#print(container.div.img["alt"])      # displays the name of the product
price = container.findAll("div", {"class": "col col-5-12 nlI3QM"})
#print(price[0].text)                 # the price of the product
ratings = container.findAll("div", {"class": "gUuXy-"})
#print(ratings[0].text)
#Making a file
filename = "products.csv"
f = open(filename, "w")
#Naming the headers
headers = "Product_Name,Pricing,Ratings\n"
f.write(headers)
for container in containers:
    product_name = container.div.img["alt"]
    price_container = container.findAll("div", {"class": "col col-5-12 nlI3QM"})
    price = price_container[0].text.strip()
    rating_container = container.findAll("div", {"class": "gUuXy-"})
    rating = rating_container[0].text
    #print("product_name:" + product_name)
    #print("price:" + price)
    #print("ratings:" + rating)
    #string parsing
    trim_price = ''.join(price.split(','))
    rm_rupee = trim_price.split("&#8377")
    add_rs_price = "Rs." + rm_rupee[0]
    split_price = add_rs_price.split('E')
    final_price = split_price[0]
    split_rating = rating.split(" ")
    final_rating = split_rating[0]
    print(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
    f.write(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
f.close()
I am getting a syntax error on this specific line:
f.write(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
I want to make a .CSV file, but the products are not ending up in the file.
The syntax error is:
Exception has occurred: UnicodeEncodeError
'charmap' codec can't encode character '\u20b9' in position 35: character maps to <undefined>
File "D:\Visual Code Folder\Python\Scraping_Flipkart.py", line 61, in <module>
f.write(product_name.replace(",", "|") + "," + final_price + "," + final_rating + "\n")
Replace this:
f = open(filename, "w")
with this:
import io
f = io.open(filename, "w", encoding="utf-8")
Using io gives you backward compatibility with Python 2.
If you only need to support Python 3, you can use the built-in open function instead:
with open(fname, "w", encoding="utf-8") as f:
    f.write(html)

In Python, what should I add to fetch URLs from my text file or my XML file, which contains a list of URLs?

I have this code, which works fine with one link.
The result of the code stores the values (availableOffers, otherpricess, currentprice, page_url) in the prices.csv file.
My problems are: First, I do not know what to write to fetch URLs from my text file or my XML file, instead of the single URL in this code.
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "XXXXXXXXX"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
availableOffers = page_soup.find("input", {"id": "availableOffers"})["value"]
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
currentprice = page_soup.find("div", {"class": "is"}).text.strip().replace("$", "")
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
f = open(out_filename, "w")
f.write(headers)
f.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + page_url + "\n")
f.close()
Second problem: when a URL does not have a value for otherpricess, I get this error
line 13, in <module>
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "")
AttributeError: 'NoneType' object has no attribute 'text'
How do I bypass this error and tell the code to keep working even when a value is missing?
Thanks
To fetch URLs from a text file, you can open the file (exactly as you did for writing) in "r" mode and iterate over its lines.
For example, let's say you have the following URLs file, named urls.txt:
http://www.google.com
http://www.yahoo.com
In order to fetch the urls and iterate over them, do the following:
out_filename = "prices.csv"
headers = "availableOffers,otherpricess,currentprice,page_url \n"
with open(out_filename, "w") as fw:
    fw.write(headers)
    with open("urls.txt", "r") as fr:
        for url in map(lambda x: x.strip(), fr.readlines()):  # strip removes the trailing '\n'
            print(url)
            uClient = uReq(url)
            page_soup = soup(uClient.read(), "html.parser")
            # write the rest of the logic here
            # ...
            # write to the output file
            fw.write(availableOffers + ", " + otherpricess + ", " + currentprice + ", " + url + "\n")
Regarding your second question, you can check that page_soup.find("span", {"class": "price"}) is not None and if so, extract the text. For example:
otherpricess = page_soup.find("span", {"class": "price"}).text.replace("$", "") if page_soup.find("span", {"class": "price"}) else ""
# if there is no value, otherpricess will be an empty string, but you can change it to any other value
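A slightly more readable variant of the same idea, a sketch that calls find only once:
price_tag = page_soup.find("span", {"class": "price"})
otherpricess = price_tag.text.replace("$", "") if price_tag else ""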

python 3.6 get text from list

I am an absolute beginner, but I have managed to make a working script out of some existing scripts and tutorials. There is only one thing I would still like to have, and unfortunately I cannot manage it.
So far, I'm getting data from a website such as "http://www.example.com/01536496/.../". Now I have a list (.csv or .txt) with many other numbers in the first column (or, in a txt file, each number on a new row). Now I want to scrape the web data for all the numbers in the list, so "http://www.example.com/No_1/.../", "http://www.example.com/No_2/.../" and so on.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import datetime
my_url = 'http://www.example.com/104289633/.../'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
...
Update
For example I have a numbers.txt with: 05543486 3468169 36189994
Now I want to put each number into the url...
Please can someone help me. I would be very grateful.
Update
After trying to use the code from Andersson...
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import datetime
# Get list of numbers
with open("numbers.txt") as f:
content = f.read()
numbers = content.split()
# Handle each URL in a loop
for number in numbers:
my_url = 'https://www.immobilienscout24.de/expose/%s#/' %number
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
print(my_url)
page_soup = soup(page_html, "html.parser")
containers = page_soup.find_all("div", {"class":"grid-item padding-desk-right-xl desk-two-thirds lap-one-whole desk-column-left flex-item palm--flex__order--1 lap--flex__order--1"})
filename = "results_"+current_datetime+".csv"
f = open(filename, "w")
headers = "titel##adresse##criteria##preis##energie##beschreibung##ausstattung##lage\n"
f.write(headers)
...
f.write(titel + "##" + adresse + "##" + criteria.replace(" ", "; ") + "##" + preis.replace(" ", "; ") + "##" + energie.replace(" ", "; ") + "##" + beschreibung.replace("\n", " ") + "##" + ausstattung.replace("\n", " ") + "##" + lage.replace("\n", " ") + "\n")
f.close()
You can use the code below:
# Get list of numbers
with open("/path/to/numbers.txt") as f:
    content = f.read()
    numbers = content.split()
# Handle each URL in a loop
for number in numbers:
    url = 'http://www.example.com/%s' % number
    # Do something with url
You can create a function that runs a for loop and updates the URL on each iteration through the loop. As the argument, you can pass the list of numbers. For example:
def scrape(numbers):
    for num in numbers:
        my_url = 'http://www.example.com/No_' + str(num) + '/.../'
        uClient = uReq(my_url)
        page_html = uClient.read()
        uClient.close()
        page_soup = soup(page_html, "html.parser")

numbers_list = [1, 2, 3, 4, 5]
scrape(numbers_list)
You can achieve this by appending the numbers to the end of your URL with a basic for loop. I am not sure if this is what you need.
...
with open('yourFile', 'r') as numbersFile:
    nums = numbersFile.readlines()
for num in nums:
    url = "http://www.example.com/No_" + num.strip() + "/.../"  # strip() removes the trailing newline
    # do what you want to do with the url...
Load from csv file
You can iterate over the file rows in various ways, but the cleanest one, in my opinion, is using pandas.
You just need to do this:
import pandas as pd
df = pd.read_csv("filename.csv")
# assuming that filename.csv's first line has a header called "Numbers"
# You can apply a function `func` to each element of the column via `map`
df['Numbers'].map(func)
Urls from Numbers
Using pandas' map function, we can pass each value to a function to create our url.
# First of all, we define this function
def numberToUrl(number):
    # We can use Python's `str.format()` to build the string
    return 'http://www.example.com/{}/.../'.format(number)
# Then we can pass this function to each value with `map`
# and assign the result to a new column
df['url'] = df['Numbers'].map(numberToUrl)
# We can print the first 5 elements via:
df.head()
As you can see, it's extremely simple to pass a function to each row.
If you want to iterate over the rows you can do it like so:
for (index, row) in df['url'].iteritems():
    # Do your operations here
In your case it would be something like this:
for (index, row) in df['url'].iteritems():
    uClient = uReq(row)
    page_html = uClient.read()
    uClient.close()
    page_soup = soup(page_html, "html.parser")
    # ...
Additional notes
I would not recommend using urllib.request directly. Instead, you could use a library called requests.
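For example, a minimal sketch of the same fetch with requests (my own illustration, not part of the original answer):
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.example.com/104289633/.../')
response.raise_for_status()  # raise an exception on HTTP error status codes
page_soup = BeautifulSoup(response.text, "html.parser")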

Python script only scrapes one item (Classified page)

Python Scraper brings only 1 item...
I'm relatively new to Python, and I made a script to scrape one of my country's classified pages. So far the script only seems to be able to grab one item, which is really driving me nuts, because I've been trying to fix it for a week now and I don't really know anyone who can help. I'd appreciate it if anyone could take a look and try to explain what exactly I am doing wrong here.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'
# opening ip connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML PARSER
page_soup = soup(page_html, "html5lib") #se cambio de "html.parser" a "html5lib por que jodia el closing form tag"
containers = page_soup.findAll("form",{"name":"listing"})
#testing variables
tags = containers[0].findAll("a", {"class":"Tahoma16Blacknounder"})
tagx = tags[0].text.strip()
filename = "products.csv"
f = open(filename, "w")
headers = "names, prices, city, product_condition\n"
f.write(headers)
for container in containers:
    #holds the names of the classifieds
    names_container = container.findAll("a", {"class": "Tahoma16Blacknounder"})
    names = names_container[0].text.strip()
    #the span class "Tahoma14BrownNound" seems to hold the prices
    prices_container = container.findAll("span", {"class": "Tahoma14BrownNound"})
    prices = prices_container[0].text
    #holds the city of the products
    city_container = container.findAll("font", {"class": "tahoma14hbluenoUnder"})
    city = city_container[0].text.strip()
    #holds the condition of the products
    product_condition_container = container.findAll("span", {"class": "style14 style15 style16"})
    product_condition = product_condition_container[0].text
    print("names: " + names)
    print("prices: " + prices)
    print("city: " + city)
    print("product_condition: " + product_condition)
    f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
f.close()
I looked at the site structure, and you're missing the parsing of the table after the form.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'http://www.clasificadosonline.com/UDMiscListingID.asp?MiscCat=75'
# opening ip connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
#HTML PARSER
page_soup = soup(page_html, "html5lib") #se cambio de "html.parser" a "html5lib por que jodia el closing form tag"
containers = page_soup.findAll("form",{"name":"listing"})
#testing variables
tags = containers[0].findAll("a", {"class":"Tahoma16Blacknounder"})
tagx = tags[0].text.strip()
filename = "products.csv"
f = open(filename, "w")
headers = "names, prices, city, product_condition\n"
f.write(headers)
tr = containers[0].findAll('tr', {"valign": "middle"})
for container in tr:
    if len(container.findAll("a", {"class": "Tahoma16Blacknounder"})) > 0:
        #holds the names of the classifieds
        names_container = container.findAll("a", {"class": "Tahoma16Blacknounder"})
        names = names_container[0].text.strip()
        #the span class "Tahoma14BrownNound" seems to hold the prices
        prices_container = container.findAll("span", {"class": "Tahoma14BrownNound"})
        prices = prices_container[0].text if len(prices_container) > 0 else ''
        #holds the city of the products
        city_container = container.findAll("font", {"class": "tahoma14hbluenoUnder"})
        city = city_container[0].text.strip()
        #holds the condition of the products
        product_condition_container = container.findAll("span", {"class": "style14 style15 style16"})
        product_condition = product_condition_container[0].text
        print("names: " + names)
        print("prices: " + prices)
        print("city: " + city)
        print("product_condition: " + product_condition)
        f.write(names.replace(",", "|") + "," + prices + "," + city + "," + product_condition + "\n")
f.close()
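If the repeated "first match or empty string" pattern gets verbose, a small helper (my own sketch, not part of the answer above) keeps the loop body shorter:
def first_text(container, name, attrs):
    # return the stripped text of the first matching tag, or '' if there is none
    matches = container.findAll(name, attrs)
    return matches[0].text.strip() if matches else ''

# usage inside the loop, e.g.:
# prices = first_text(container, "span", {"class": "Tahoma14BrownNound"})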

Scraping Python Script: Looping and updating output

So I have this Python script. Right now, I run the script and it gives me an output file in CSV.
What I want: when it finishes, restart it and check for changes to those output values (without refreshing the output file on restart and erasing all the previously collected data).
Also, it takes about 3 seconds to retrieve each line of data. Does anyone know how I can speed it up to handle large data sets?
import urllib2, re, urllib, urlparse, csv, sys, time, threading, codecs
from bs4 import BeautifulSoup

def extract(url):
    try:
        sys.stdout.write('0')
        global file
        page = urllib2.urlopen(url).read()
        soup = BeautifulSoup(page, 'html.parser')
        product = soup.find("div", {"class": "js-product-price"})
        price = product.findNext('div', {'class': 'js-price-display'}).getText().strip()
        oos = product.findNext('p', attrs={'class': "price-oos"})
        if oos is None:
            oos = 'In Stock'
        else:
            oos = oos.getText()
        val = url + "," + price + "," + oos + "," + time.ctime() + '\n'
        ifile.write(val)
        sys.stdout.write('1')
    except Exception as e:
        print e
        #pass
    return

ifile = open('output.csv', "a", 0)
ifile.write('URL' + "," + 'Price' + "," + 'Stock' + "," + "Time" + '\n')
inputs = csv.reader(open('input.csv'))
#inputs = csv.reader(codecs.open('input.csv', 'rU', 'utf-16'))
for i in inputs:
    extract(i[0])
ifile.close()
print("finished")
