I am just messing around with BeautifulSoup and testing it on different websites after recently learning about it. I am currently trying to iterate through multiple pages instead of just the first page. I can append or write the information I am grabbing from any specific page that I desire but of course I would love to automate it.
This is my current code when trying to get it to work up to page five. Currently it only goes through the first webpage and writes the same info I am looking for to my excel file, five times. In my nested for loop I have some print statements just to see if it is working on the console before I even look in the file.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import unicodecsv as csv
f = open("on_sale_games.csv", "w", encoding='utf-8')
headers = "Game name, Original price, Final price, Percent off\n"
f.write(headers)
for i in range(5):
my_url = 'https://store.steampowered.com/specials#p={}&tab=TopSellers'.format(i+1)
uClient = uReq(my_url) # open up the url and download the page.
page_html = uClient.read() # reading the html page and storing the info into page_html.
uClient.close() # closing the page.
page_soup = soup(page_html, 'html.parser') # html parsing
containers = page_soup.findAll("a", {"class": "tab_item"})
for container in containers:
name_stuff = container.findAll("div", {"class": "tab_item_name"})
name = name_stuff[0].text
print("Game name:", name)
original_price = container.findAll("div", {"class": "discount_original_price"})
original = original_price[0].text
print("Original price:", original)
discounted_price = container.findAll("div", {"class": "discount_final_price"})
final = discounted_price[0].text
print("Discounted price:", final)
discount_pct = container.findAll("div", {"class": "discount_pct"})
pct = discount_pct[0].text
print("Percent off:", pct)
f.write(name.replace(':', '').replace("™", " ") + ',' + original + ',' + final + ',' + pct + '\n')
f.close()
Checking through the requests made by the browser, I noticed there's a request made in the background to fetch the data and get json result, you could work your way from there:
for i in range(5):
my_url = 'https://store.steampowered.com/contenthub/querypaginated/specials/NewReleases/render/?query=&start={}'.format(i*15)
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
data = json.loads(page_html)["results_html"]
page_soup = soup(data, 'html.parser')
# Rest of the code
It's like an API that get's 15 elements per page, so it's starts at 0, 15, 30 and so on.
Related
I'm brand new to python and figured I'd try to learn making a web scrape. So I'm trying to scrape a Newegg website for graphics cards, but seem to have some trouble with errors. All I want to do is grab the data and import it into a cvs file that I can view. but it seems that if I comment that out i get another error, I seem to not be able to figure it out. Any help is appreciated!
File "webScrape.py", line 32, in
price = price_container[0].text.strip("|")
IndexError: list index out of range
# import beautiful soup 4 and use urllib to import urlopen
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# url where we will grab the product data
my_url = 'https://www.newegg.com/Product/ProductList.aspxSubmit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&ignorear=0&N=-1&isNodeId=1'
# open connection and grab the URL page information, read it, then close it
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# parse html from the page
page_soup = soup(page_html, "html.parser")
# find each product within the item-container class
containers = page_soup.findAll("div",{"class":"item-container"})
# write a file named products.csv with the data returned
filename = "products.csv"
f = open(filename, "w")
# create headers for products
headers = "price, product_name, shipping\n"
f.write("")
# define containers based on location on webpage and their DOM elements
for container in containers:
price_container = container.findAll("li", {"class":"price-current"})
price = price_container[0].text.strip("|")
title_container = container.findAll("a", {"class":"item-title"})
product_name = title_container[0].text
shipping_container = container.findAll("li",{"class":"price-ship"})
shipping = shipping_container[0].text.strip()
f.write(price + "," + product_name.replace(",", "|") + "," + shipping + "\n")
f.close()
You could write to a dataframe and that is easy to export to csv. I have added an additional class selector of .list-wrap to titles to ensure all lists are the same length.
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
def main():
url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=+graphics+cards&N=-1&isNodeId=1'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
prices = soup.select('.price-current')
titles = soup.select('.list-wrap .item-title')
shipping = soup.select('.price-ship')
items = list(zip(titles,prices, shipping))
results = [[title.text.strip(),re.search('\$\d+.\d+',price.text.strip()).group(0),ship.text.strip()] for title, price,ship in items]
df = pd.DataFrame(results,columns=['title', 'current price', 'shipping cost'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
if __name__ == "__main__":
main()
Is there something wrong with my loop statements or have i got the wrong tag? but when i check len(containers) it gives me 20 which means i have got the tag right but when i try to use a selector like i used p.text and strong.text stored into a variable and then write it into a file i get just only one element for product_name and one for price. kindly review
Here is my script:
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
my_url = 'https://laptopparts.ca/collections/types?q=Accessories'
Uclient = Ureq(my_url)
page_html = Uclient.read()
Uclient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div",{"class":"grid__item large--one-quarter medium-down--one-half"})
filename = "products.csv"
f = open(filename, "w")
headers = "Title, Price\n"
f.write(headers)
for container in containers:
title_container = container.findAll("p")
product_name = title_container[0].text
price_container = container.findAll("strong")
price = price_container[0].text
print("product_name " + product_name)
print("price " + price)
f.write(product_name.replace(",","|") + "," + price + "\n")
f.close()
Give it a go. The below script should fetch you all the item titles and price and write those in a csv file accordingly. I tried to shake off some extraneous lines which made your script look horrible.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
res = urlopen('https://laptopparts.ca/collections/types?q=Accessories').read()
soup = BeautifulSoup(res, "html.parser")
with open("output.csv","w",newline="") as infile:
writer = csv.writer(infile)
writer.writerow(['Title','Price'])
for container in soup.find_all("div",{"class":"large--one-quarter"}):
title = container.find("p").text
price = container.find("strong").text
print(title,price)
writer.writerow([title,price])
I am new to web scraping and for practice I am trying to web scrape a website and turn the results into a csv file. When I come to the part to turn the results into a csv file, it doesn't put the address in the address column. I want the data to go into the address column. The code is as follows.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.allagents.co.uk/find-agent/london/'
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, 'html.parser')
containers = page_soup.findAll('div', {'class':'itemlabel3'})
filename = "webscrape.csv"
f = open(filename, "w")
headers = "Company Name, Address, Telephone Number\n"
f.write(headers)
for container in containers:
comp_name = container.find('div', {'class':'labelleft2 col-md-
10'}).div.h4.a.text
address = container.find('div', {'class':'labelleft2 col-md-
10'}).div.p.text
tel = container.find('div', {'class':'labelleft2 col-md-
10'}).div.find('p', {'style':'clear: both; margin-bottom:
15px;'}).strong.text
print("Company Name:", comp_name)
print("Address:", address)
print("Telephone", tel)
f.write(comp_name.replace(",", ("|")) + "," + address.replace(",", ("|")) +
"," + tel + "\n")
f.close()
Any help is appreciated. Thanks you in advance.
it seems like in your address data new line character is present
try to replace below line for address in your code and try running again
address=(container.find('div', {'class':'labelleft2 col-md-10'}).div.p.text).replace('\n','')
Im am trying to scrape developer jobs from indeed.nl to Excel using Python and bs4. Everything works fine but when I open it in Excel, there are extra lines cells between the jobs
Can anyone see what I did wrong?
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'https://www.indeed.nl/jobs?q=developer&l='
# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
#grabs each job
containers = page_soup.findAll("div",{"class":"row"})
filename = "indeedjobs.csv"
f = open(filename, "w")
headers = "Company; Job; City\n"
f.write(headers)
for container in containers:
jobtitle = container.a["title"]
city_container = container.findAll("span",{"class":"location"})
City_name = city_container[0].text
company_container = container.findAll("span",{"class":"company"})
company_name = company_container[0].text
print("Company: " + company_name)
print("Job: " + jobtitle)
print("City: " + City_name)
f.write(company_name + ";" + jobtitle + ";" + City_name + "\n")
f.close()
The <span class="company"> element starts with a newline and some spaces. Remove those with .strip().
You can also consider the csv module to write well-formatted CSV files. The module will help you with correct escaping of special characters.
i am writing some text in csv file using python..
Here is the screenshot of that in which way i am getting write data in file.
You can see that in Channel Social Media Links column all the links are writing well in other next line cell but first link is not writing in Channel Social Media Links column. Please how can i write like this?
My python script is here
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
myUrl='https://www.youtube.com/user/HolaSoyGerman/about'
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("h1",{"class":"branded-page-header-title"})
filename="Products2.csv"
f = open(filename,"w")
headers = "Channel Name,Channel Description,Channel Social Media Links\n"
f.write(headers)
channel_name = containers[0].a.text
print("Channel Name :" + channel_name)
# For About Section Info
aboutUrl='https://www.youtube.com/user/HolaSoyGerman/about'
uClient1 = uReq(aboutUrl)
page_html1 = uClient1.read()
uClient1.close()
page_soup1 = soup(page_html1, "html.parser")
description_div = page_soup.findAll("div",{"class":"about-description
branded-page-box-padding"})
channel_description = description_div[0].pre.text
print("Channel Description :" + channel_description)
f.write(channel_name+ "," +channel_description)
links = page_soup.findAll("li",{"class":"channel-links-item"})
for link in links:
social_media = link.a.get("href")
f.write(","+","+social_media+"\n")
f.close()
It would help if you made use of Python's CSV library when writing to your file. This is able to convert a list of items into correctly comma separated values.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv
myUrl = 'https://www.youtube.com/user/HolaSoyGerman/about'
uClient = uReq(myUrl)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("h1",{"class":"branded-page-header-title"})
filename = "Products2.csv"
with open(filename, "w", newline='') as f:
csv_output = csv.writer(f)
headers = ["Channel Name", "Channel Description", "Channel Social Media Links"]
csv_output.writerow(headers)
channel_name = containers[0].a.text
print("Channel Name :" + channel_name)
# For About Section Info
aboutUrl = 'https://www.youtube.com/user/HolaSoyGerman/about'
uClient1 = uReq(aboutUrl)
page_html1 = uClient1.read()
uClient1.close()
page_soup1 = soup(page_html1, "html.parser")
description_div = page_soup.findAll("div",{"class":"about-description branded-page-box-padding"})
channel_description = description_div[0].pre.text
print("Channel Description :" + channel_description)
links = [link.a.get('href') for link in page_soup.findAll("li",{"class":"channel-links-item"})]
csv_output.writerow([channel_name, channel_description, links[0]])
for link in links[1:]:
csv_output.writerow(['', '', link])
This would give you a single row with each of the hrefs in the last column, for example:
Channel Name,Channel Description,Channel Social Media Links
HolaSoyGerman.,Los Hombres De Verdad Usan Pantuflas De Perrito,http://www.twitter.com/germangarmendia
,,http://instagram.com/germanchelo
,,http://www.youtube.com/juegagerman
,,http://www.youtube.com/juegagerman
,,http://www.twitter.com/germangarmendia
,,http://instagram.com/germanchelo
,,https://plus.google.com/108460714456031131326
Each writerow() call will write a list of values to the file as comma separated values and automatically add the newline for you at the end. All that is needed is to build the lists of values for each row. First of all take the first of your links and make it the last value in the list after your channel description. Secondly, write a row for each remaining link where the first two columns have blank values.
To answer your comment, the following should get you started:
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import csv
def get_data(url, csv_output):
if not url.endswith('/about'):
url += '/about'
print("URL: {}".format(url))
uClient = uReq(url)
page_html = uClient.read()
uClient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("h1", {"class":"branded-page-header-title"})
channel_name = containers[0].a.text
print("Channel Name :" + channel_name)
description_div = page_soup.findAll("div", {"class":"about-description branded-page-box-padding"})
channel_description = description_div[0].pre.text
print("Channel Description :" + channel_description)
links = [link.a.get('href') for link in page_soup.findAll("li", {"class":"channel-links-item"})]
csv_output.writerow([channel_name, channel_description, links[0]])
for link in links[1:]:
csv_output.writerow(['', '', link])
#TODO - get list of links for the related channels
return related_links
my_url = 'https://www.youtube.com/user/HolaSoyGerman'
filename = "Products2.csv"
with open(filename, "w", newline='') as f:
csv_output = csv.writer(f)
headers = ["Channel Name", "Channel Description", "Channel Social Media Links"]
csv_output.writerow(headers)
for _ in range(5):
next_links = get_data(my_url, csv_output)
my_url = next_links[0] # e.g. follow the first of the related links