I'm trying to scrape a URL for elements and write them to a file, but it doesn't seem to work. The function runs fine when I just print them.
def get_price():
    page = requests.get(url, headers=HEADERS)
    soup = BeautifulSoup(page.content, 'lxml')
    while True:
        try:
            product_title = soup.find("span",{'id': "productTitle"}).text.strip()
            #price = float((soup.find("span",{'class': "a-offscreen"}).text.split("₹")[1]).replace(',',""))
            f = open("Textfile.txt", "w")
            f.write(product_title)
            f.close()
            break
        except AttributeError:
            get_price()
            break

get_price()
Hello, I'm a beginner to Python and programming in general, and I was wondering how I would make the outputted data a list. I used bs4 to extract data from a table and attempted to make a list with the data, but I end up only adding the first number to the list. Can someone provide me with assistance and an explanation?
from bs4 import BeautifulSoup
from requests_html import HTMLSession

s = HTMLSession()
url = 'https://www.timeanddate.com/weather/usa/new-york/ext'

def get_data(url):
    r = s.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    return soup

with open('document.txt', 'a') as f:
    f.write(str(get_data(url)))

with open('document.txt', 'r') as html_file:
    contents = html_file.read()

soup = BeautifulSoup(contents, 'lxml')
forecast_table = soup.find('table', class_ = 'zebra tb-wt fw va-m tb-hover')

wtitle = soup.title.text
print(wtitle)
print("------")

def get_weather_high(forecast_table):
    print("Weather Highs:")
    for high in forecast_table.find_all('tbody'):
        rows1 = high.find_all('tr')
        for row1 in rows1:
            pl_high = row1.find_all('td')
            pl_high = [td.text.strip() for td in pl_high]
            pl_high = pl_high[1:2]
            for pl_high_final in pl_high:
                pl_high_final = pl_high_final[0:3]
                print(pl_high_final)

get_weather_high(forecast_table)
This is the output. Instead of each line being a separate number, I want to have it all in one list.
Create a list before your for loop and append your data to it instead of printing it, then print the list after the for loop:
data = []

def get_weather_high(forecast_table):
    print("Weather Highs:")
    for high in forecast_table.find_all('tbody'):
        rows1 = high.find_all('tr')
        for row1 in rows1:
            pl_high = row1.find_all('td')
            pl_high = [td.text.strip() for td in pl_high]
            pl_high = pl_high[1:2]
            for pl_high_final in pl_high:
                pl_high_final = pl_high_final[0:3]
                data.append(pl_high_final)

print(data)  # or return data if you need it somewhere else
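If you need the values elsewhere, a variant that builds and returns the list inside the function (instead of appending to a module-level data list) could look roughly like this, reusing the same slicing as above:

def get_weather_high(forecast_table):
    highs = []
    for body in forecast_table.find_all('tbody'):
        for row in body.find_all('tr'):
            cells = [td.text.strip() for td in row.find_all('td')]
            if len(cells) > 1:               # skip rows without a second cell
                highs.append(cells[1][0:3])  # second column, first three characters, as above
    return highs

print(get_weather_high(forecast_table))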
When I try to write the information to the CSV file, this error is thrown:
Traceback (most recent call last):
  File "sizeer.py", line 68, in <module>
    writer.writerow([name,color,price])
ValueError: I/O operation on closed file
import requests
import csv
from bs4 import BeautifulSoup

proxies = {
    "http":"http://195.189.60.97:3128",
    "http":"http://103.78.75.165:8080",
    "http":"http://212.87.220.2:3128",
    "http":"http://88.99.134.61:8080",
    "http":"http://103.102.139.178:8080",
    "http":"http://218.60.8.83:3129",
    "http":"http://124.121.105.193:8888",
    "http":"http://198.237.114.54:8080",
    "http":"http://36.67.106.58:8080",
    "http":"http://35.214.241.28:3128"
}

base_url = ...
page = requests.get(base_url, proxies=proxies)
if page.status_code != 200:
    exit("Page wasn't parsed")
soup = BeautifulSoup(page.content, 'lxml')

with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])

# Get categories
category_wrapper = soup.find_all(class_="m-menu_subItem")
categories = []
for cw in category_wrapper:
    anchor = cw.find("a", recursive=False)
    categories.append(anchor['href'])

# Iterate categories
for category in categories:
    cat_page = requests.get(base_url + category, proxies=proxies)
    cat_soup = BeautifulSoup(cat_page.content, 'lxml')
    products_wrapper = cat_soup.find(class_="b-productList")
    cat_pagination = products_wrapper.find(class_="m-pagination").find_all("span")
    max_page = [int(s) for s in cat_pagination[-1].text.split() if s.isdigit()][0]

    # Iterate category with pagination and get products
    for i in range(1, max_page+1):
        cat_pagination_page = requests.get(base_url+category+"/?sort=default&limit=60&page="+str(i), proxies=proxies)
        cat_pagination_page_soup = BeautifulSoup(cat_pagination_page.content, 'lxml')
        product_links = cat_pagination_page_soup.find_all(class_="b-itemList_photoLink")
        for link in product_links:
            # Get product data
            product_page = requests.get(base_url+link['href'], proxies=proxies)
            product_soup = BeautifulSoup(product_page.content, 'lxml')
            # Get product variations
            variations = product_soup.find_all(class_="m-productDescr_colorItem")
            # If there are variations
            if len(variations) > 0:
                for v in variations:
                    variation_page = requests.get(base_url+v['href'], proxies=proxies)
                    variation_soup = BeautifulSoup(variation_page.content, 'lxml')
                    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
                    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
                    color = v['title']
                    print(name)
                    print(color)
                    print(price)
                    print("-------------")
                    # Save in csv
                    writer.writerow([name,color,price])

print("SCRAPING DONE")
How do I keep the file open through the whole script execution? Or do I have to open it every time I add content? EDIT: In fact, the file is not even created.
with open("result.csv", "w") as file:
writer = csv.writer(file)
writer.writerow(["Product","Color","Price"])
The file closes at the end of the with block - that is the block's purpose.
You could put everything inside the block, but that only makes the existing problem worse: the code already reaches several levels of indentation, is long, and is becoming difficult to understand. This is why you use functions to organize the code. For example, you could put the big for loop in a function:
def do_stuff_with(categories, writer):
    for category in categories:
        # lots of logic here
        # use `writer.writerow` when needed

# Get everything else set up that doesn't need the file, first
categories = ... # do the BeautifulSoup input stuff

# then we can open the file and use the function:
with open("result.csv", "w") as file:
    writer = csv.writer(file)
    writer.writerow(["Product","Color","Price"])
    do_stuff_with(categories, writer)
Once you have that working, you can probably think of ways to apply the technique further. For example, pull out the innermost logic for handling the variations of a single product, or write a function that builds the categories data and returns it, as in the sketch below.
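As a rough sketch of that direction (the helper names here are invented; the class names come from your snippet):

def get_categories(soup):
    # collect the category links from the menu
    categories = []
    for cw in soup.find_all(class_="m-menu_subItem"):
        anchor = cw.find("a", recursive=False)
        categories.append(anchor['href'])
    return categories

def variation_row(variation_soup, color):
    # build one [name, color, price] row from a variation page
    price = variation_soup.find(class_="s-newPrice").text.strip().split(" ")[0]
    name = variation_soup.find(class_="m-productDescr_headline").text.strip()
    return [name, color, price]

The main loop then only fetches pages, calls these helpers, and passes each row to writer.writerow, which keeps the with block short and each piece easy to test on its own.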
I hope you're well. Could you please tell me why my scraping script isn't working properly? :)
It works with other websites. I'm a beginner, so I probably made a basic mistake.
import requests
from bs4 import BeautifulSoup
import time
import csv

links = []

for i in range(1):
    url = '*******/recettes/?page={}' + str(i)
    res = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'})
    response = requests.get(url)
    print(response)
    if response.ok:
        print('Page: ' + str(i))
        soup = BeautifulSoup(response.text, "html.parser")
        divs = soup.findAll('div', class_ = 'field-item even')
        for div in divs:
            a = div.find('a')
            link = a['href']
            links.append('*******' + link)
        time.sleep(3)

print(len(links))

with open('urls3.txt', 'w') as file:
    for link in links:
        file.write(link + '\n')

"""
with open('urls3.txt', 'r') as inf:
    with open('recipes3.csv', 'w') as outf:
        outf.write('titre,image,url,\n')
        for row in inf:
            url = row.strip()
            response = requests.get(url)
            if response.ok:
                soup = BeautifulSoup(response.text, "html.parser")
                titre = soup.find('h1')
                image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
                print(titre.text, image, url)
                outf.write(str(titre.text) + ',' + str(image) + ',' + str(url) + '\n')
                time.sleep(1)
"""
Could you please tell me why there is an error here:
<Response [200]>
Page: 0
Traceback (most recent call last):
  File "ex3.py", line 18, in <module>
    link = a['href']
TypeError: 'NoneType' object is not subscriptable
I've found the answer; I'm posting it here :) for anyone interested.
try:
    image = soup.find('img', {"id":"recipe-media-viewer-thumbnail-1"})['src']
except Exception as e:
    image = None
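The same guard applies to the lookup that raised the TypeError in the first part of the script: div.find('a') returns None when a div contains no <a> tag, so those entries can simply be skipped. A sketch:

for div in divs:
    a = div.find('a')
    if a is None:      # some 'field-item even' divs have no link
        continue
    links.append('*******' + a['href'])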
I am trying to copy all the data within an HTML page that has a certain class, "chapter_header_styling", with BS4.
This was working when I manually entered the URL, but it is tedious when there are multiple books and various chapters. So I then created another script that generates all the chapter URLs for the book and combines them into a text file, bchap.txt (book chapters).
Since then I have altered the file and added various break points, so ignore my lack of comments and unused arrays/lists. I have narrowed it down to the ###Comment## section where it doesn't work. It's probably not nested right, but I'm not sure... I had this working to a point, but I can't figure out why it won't write the mydivs data into the BOOK.html file. If anyone with more experience could point me in the right direction, it would be much appreciated.
# mkbook.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org/content/FAC2017"
pop = ""
#z = ""
chapters = open("bchap.txt",'r')
a = []

for aline in chapters:
    chap = aline
    #print (chap)
    #pop = ""
    pop = LINK+chap
    #print (pop)
    r = requests.get(pop)
    data = r.text
    #print(data)
    soup = BeautifulSoup(data, 'html.parser')
    mydivs = soup.findAll("div", {"class": ["annotator", "chapter_header_styling"]})
    f = open("BOOK.html","a")
    f.write("test <br/>")
    ########################################
    # MY PROBLEM IS BELOW NOT PRINTING DIV DATA INTO TXT FILE
    ########################################
    for div in mydivs:
        print (div)
        z = str(div)
        print(z) #doesn't printout...why???
        f.write(z)
    print len(mydivs)
    f.close()

chapters.close()
##############################################
## this is the old mkbook.py code before I looped it - inputting the url one at a time
#
# coding: utf-8
from bs4 import BeautifulSoup
import requests

r = requests.get("https://codes.iccsafe.org/content/FAC2017/preface")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
mydivs = soup.findAll("div",{"class":["annotator",
                                      "chapter_header_styling"]})
f = open("BOOK.html","a")
for div in mydivs:
    z = str(div)
    f.write(z)
f.close()
print len(mydivs) # outputs 1 if it copied the div data.
#mkchap.py
# coding: utf-8
from bs4 import BeautifulSoup
import requests
r = requests.get("https://codes.iccsafe.org/content/FAC2017")
data = r.text
soup = BeautifulSoup(data, 'html.parser')
a = []
soup.findAll('option',{"value":True})
list = soup.findAll('option')
with open('bchap.txt', 'w') as filehandle:
for l in list:
filehandle.write(l['value'])
filehandle.write("\n")
print l['value']
#with open('bchap.txt', 'w') as filehandle:
# filehandle.write("%s\n" % list)
filehandle.close()
The problem seems to be that you are constructing your URL with the wrong base URL:
LINK = "https://codes.iccsafe.org/content/FAC2017"
If you take a look at your first request, you can see this clearly:
print(pop)
print(r.status_code)
Outputs:
https://codes.iccsafe.org/content/FAC2017/content/FAC2017
404
After running the code that populates bchap.txt, its contents are:
/content/FAC2017
/content/FAC2017/legend
/content/FAC2017/copyright
/content/FAC2017/preface
/content/FAC2017/chapter-1-application-and-administration
/content/FAC2017/chapter-2-scoping-requirements
/content/FAC2017/chapter-3-building-blocks
/content/FAC2017/chapter-4-accessible-routes
/content/FAC2017/chapter-5-general-site-and-building-elements
/content/FAC2017/chapter-6-plumbing-elements-and-facilities
/content/FAC2017/chapter-7-communication-elements-and-features
/content/FAC2017/chapter-8-special-rooms-spaces-and-elements
/content/FAC2017/chapter-9-built-in-elements
/content/FAC2017/chapter-10-recreation-facilities
/content/FAC2017/list-of-figures
/content/FAC2017/fair-housing-accessibility-guidelines-design-guidelines-for-accessible-adaptable-dwellings
/content/FAC2017/advisory
Let's change the base URL first and try again:
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []

for aline in chapters:
    chap = aline
    pop = LINK+chap
    r = requests.get(pop)
    print(pop)
    print(r.status_code)

chapters.close()
Outputs:
https://codes.iccsafe.org/content/FAC2017
404
...
Why? Because of the \n. If we do a
print(repr(pop))
It will output
'https://codes.iccsafe.org/content/FAC2017\n'
You'll have to strip away that \n as well. The final code that worked is:
from bs4 import BeautifulSoup
import requests

LINK = "https://codes.iccsafe.org"
pop = ""
chapters = open("bchap.txt",'r')
a = []

for aline in chapters:
    chap = aline
    pop = LINK+chap
    r = requests.get(pop.strip())
    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    mydivs = soup.findAll("div", class_="annotator chapter_header_styling")
    f = open("BOOK.html","a")
    for div in mydivs:
        z = str(div)
        f.write(z)
    f.close()

chapters.close()
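As a side note, you could also strip the newline as soon as each line is read, and build the URL with urljoin from the standard library (Python 3); this is just an alternative sketch of the same fix:

from urllib.parse import urljoin
import requests

LINK = "https://codes.iccsafe.org"

with open("bchap.txt") as chapters:
    for aline in chapters:
        chap = aline.strip()        # drop the trailing '\n' right away
        pop = urljoin(LINK, chap)   # joins the base with the '/content/...' path
        r = requests.get(pop)
        print(pop, r.status_code)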
Only the first result is being written to the CSV, with one letter of the URL per row, instead of all the URLs being written, one per row.
What am I doing wrong in the last section of this code that causes the CSV to be written with only one of the results instead of all of them?
import requests
from bs4 import BeautifulSoup
import csv

def grab_listings():
    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/2/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/3/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/4/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/5/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/6/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/7/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/8/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/9/")
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class":"wlt_search_results"})
    for elem in l_area.findAll("a", {"class":"frame"}):
        return elem["href"]

l = grab_listings()

with open("gyms.csv", "wb") as file:
    writer = csv.writer(file)
    for row in l:
        writer.writerow(row)
So I refactored your code a bit and I think it should now work as you would expect:
import requests
from bs4 import BeautifulSoup
import csv

def grab_listings(page_idx):
    ret = []
    url = ("http://www.gym-directory.com/listing-category/gyms-fitness-centres/"
           "page/{}/").format(page_idx)  # the index of the page will be inserted here
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    l_area = soup.find("div", {"class": "wlt_search_results"})
    for elem in l_area.findAll("a", {"class": "frame"}):
        # be sure to add all your results to a list and return it;
        # if you return here then you will only get the first result
        ret.append(elem["href"])
    return ret

def main():
    l = []  # this will be a list of lists
    # call the function 9 times here with idx from 1 till 9
    for page_idx in range(1, 10):
        l.append(grab_listings(page_idx))
    print l

    with open("gyms.csv", "wb") as f:
        writer = csv.writer(f)
        for row in l:
            # be sure that your row is a list here; if it is only
            # a string, all characters will be separated by a comma.
            writer.writerow(row)

    # for writing each URL in one line separated by commas at the end
    # with open("gyms.csv", "wb") as f:
    #     for row in l:
    #         string_to_write = ',\n'.join(row)
    #         f.write(string_to_write)

if __name__ == '__main__':
    main()
I added some comments to the code and hope it is explanatory enough. If not, just ask :)
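With this structure, each row of gyms.csv holds all the listing URLs from one page, because writerow receives the list of hrefs collected for that page; if you want one URL per row instead, append the hrefs to a single flat list and write each one wrapped in its own one-element list.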
Simplified:
import requests
from bs4 import BeautifulSoup
import csv

def grab_listings():
    for i in range(0, 5):  # pages 1 to 5; widen the range for more pages
        url = "http://www.gym-directory.com/listing-category/gyms-fitness-centres/page/{}/"
        r = requests.get(url.format(i + 1))
        soup = BeautifulSoup(r.text, 'html.parser')
        l_area = soup.find("div", {"class": "wlt_search_results"})
        for elem in l_area.findAll("a", {"class": "frame"}):
            yield elem["href"]

l = grab_listings()
with open("gyms.csv", "w") as file:
    writer = csv.writer(file)
    for row in l:
        writer.writerow([row])  # wrap the URL in a list so it stays in a single column
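One more note on the simplified version: grab_listings is now a generator, so no pages are fetched until the csv loop iterates over it. On Python 3, the csv docs also recommend opening the file with newline='' so the writer controls line endings itself, for example:

with open("gyms.csv", "w", newline="") as file:  # newline="" avoids blank lines between rows on Windows
    writer = csv.writer(file)
    for row in grab_listings():
        writer.writerow([row])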