I'm brand new to python and figured I'd try to learn making a web scrape. So I'm trying to scrape a Newegg website for graphics cards, but seem to have some trouble with errors. All I want to do is grab the data and import it into a cvs file that I can view. but it seems that if I comment that out i get another error, I seem to not be able to figure it out. Any help is appreciated!
File "webScrape.py", line 32, in
price = price_container[0].text.strip("|")
IndexError: list index out of range
# import beautiful soup 4 and use urllib to import urlopen
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
# url where we will grab the product data
my_url = 'https://www.newegg.com/Product/ProductList.aspxSubmit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&ignorear=0&N=-1&isNodeId=1'
# open connection and grab the URL page information, read it, then close it
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# parse html from the page
page_soup = soup(page_html, "html.parser")
# find each product within the item-container class
containers = page_soup.findAll("div",{"class":"item-container"})
# write a file named products.csv with the data returned
filename = "products.csv"
f = open(filename, "w")
# create headers for products
headers = "price, product_name, shipping\n"
f.write("")
# define containers based on location on webpage and their DOM elements
for container in containers:
price_container = container.findAll("li", {"class":"price-current"})
price = price_container[0].text.strip("|")
title_container = container.findAll("a", {"class":"item-title"})
product_name = title_container[0].text
shipping_container = container.findAll("li",{"class":"price-ship"})
shipping = shipping_container[0].text.strip()
f.write(price + "," + product_name.replace(",", "|") + "," + shipping + "\n")
f.close()
You could write to a dataframe and that is easy to export to csv. I have added an additional class selector of .list-wrap to titles to ensure all lists are the same length.
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
def main():
url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=+graphics+cards&N=-1&isNodeId=1'
res = requests.get(url)
soup = BeautifulSoup(res.content, "lxml")
prices = soup.select('.price-current')
titles = soup.select('.list-wrap .item-title')
shipping = soup.select('.price-ship')
items = list(zip(titles,prices, shipping))
results = [[title.text.strip(),re.search('\$\d+.\d+',price.text.strip()).group(0),ship.text.strip()] for title, price,ship in items]
df = pd.DataFrame(results,columns=['title', 'current price', 'shipping cost'])
df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8',index = False )
if __name__ == "__main__":
main()
Related
I am just messing around with BeautifulSoup and testing it on different websites after recently learning about it. I am currently trying to iterate through multiple pages instead of just the first page. I can append or write the information I am grabbing from any specific page that I desire but of course I would love to automate it.
This is my current code when trying to get it to work up to page five. Currently it only goes through the first webpage and writes the same info I am looking for to my excel file, five times. In my nested for loop I have some print statements just to see if it is working on the console before I even look in the file.
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
import unicodecsv as csv
f = open("on_sale_games.csv", "w", encoding='utf-8')
headers = "Game name, Original price, Final price, Percent off\n"
f.write(headers)
for i in range(5):
my_url = 'https://store.steampowered.com/specials#p={}&tab=TopSellers'.format(i+1)
uClient = uReq(my_url) # open up the url and download the page.
page_html = uClient.read() # reading the html page and storing the info into page_html.
uClient.close() # closing the page.
page_soup = soup(page_html, 'html.parser') # html parsing
containers = page_soup.findAll("a", {"class": "tab_item"})
for container in containers:
name_stuff = container.findAll("div", {"class": "tab_item_name"})
name = name_stuff[0].text
print("Game name:", name)
original_price = container.findAll("div", {"class": "discount_original_price"})
original = original_price[0].text
print("Original price:", original)
discounted_price = container.findAll("div", {"class": "discount_final_price"})
final = discounted_price[0].text
print("Discounted price:", final)
discount_pct = container.findAll("div", {"class": "discount_pct"})
pct = discount_pct[0].text
print("Percent off:", pct)
f.write(name.replace(':', '').replace("™", " ") + ',' + original + ',' + final + ',' + pct + '\n')
f.close()
Checking through the requests made by the browser, I noticed there's a request made in the background to fetch the data and get json result, you could work your way from there:
for i in range(5):
my_url = 'https://store.steampowered.com/contenthub/querypaginated/specials/NewReleases/render/?query=&start={}'.format(i*15)
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
data = json.loads(page_html)["results_html"]
page_soup = soup(data, 'html.parser')
# Rest of the code
It's like an API that get's 15 elements per page, so it's starts at 0, 15, 30 and so on.
I have created a scraping code to take information from a local newspaper site. I have two existing problems with the current code.
When it retrieves the paragraph data, and saves it to the CSV. it recognises "," as breaks and save the relevant data in the adjacent cell. How do I stop this from happening?
I want them to the scraped information in rows. i.e paragraph, title, weblink
Code below;
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "https://neweralive.na/today/"
ne_url = "https://neweralive.na/posts/"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
containers = page_soup.findAll("article", {"class": "post-item"})
filename = "newera.csv"
headers = "paragraph,title,link\n"
f = open(filename, "w")
f.write(headers)
for container in containers:
paragraph_container = container.findAll("p", {"class": "post-excerpt"})
paragraph = paragraph_container[0].text
title_container = container.findAll("h3", {"class": "post-title"})
title = title_container[0].text
weblink = ne_url + title_container[0].a["href"]
f.write(paragraph + "," + title + "," + weblink + "\n")
f.close()
You can use the built-in csv module to write well-formed CSV with quotes around strings that need those (e.g. those containing commas).
While at it, I refactored your code to use reusable functions:
get_soup_from_url() downloads an URL and gets a BeautifulSoup out of it
parse_today_page() is a generator function that can walk through that soup and return dicts of each article
The main code now just uses csv.DictWriter on the opened file; the dicts parsed are printed to the console for debugging ease and fed to the CSV writer for output.
from bs4 import BeautifulSoup
from urllib.request import urlopen
import csv
base_url = "https://neweralive.na/posts/"
def get_soup_from_url(url):
resp = urlopen(url)
page_soup = BeautifulSoup(resp.read(), "html.parser")
resp.close()
return page_soup
def parse_today_page(page_soup):
for container in page_soup.findAll("article", {"class": "post-item"}):
paragraph_container = container.findAll(
"p", {"class": "post-excerpt"}
)
paragraph = paragraph_container[0].text
title_container = container.findAll("h3", {"class": "post-title"})
title = title_container[0].text
weblink = base_url + title_container[0].a["href"]
yield {
"paragraph": paragraph,
"title": title.strip(),
"link": weblink,
}
print("Downloading...")
page_soup = get_soup_from_url("https://neweralive.na/today/")
with open("newera.csv", "w") as f:
writer = csv.DictWriter(f, ["paragraph", "title", "link"])
writer.writeheader()
for entry in parse_today_page(page_soup):
print(entry)
writer.writerow(entry)
The generated CSV ends up looking like e.g.
paragraph,title,link
"The mayor of Helao Nafidi, Elias Nghipangelwa, has expressed disappointment after Covid-19 relief food was stolen and sold by two security officers entrusted to guard the warehouse where the food was stored.","Guards arrested for theft of relief food",https://neweralive.na/posts/posts/guards-arrested-for-theft-of-relief-food
"Government has decided to construct 1 200 affordable homes, starting Thursday this week.","Govt to construct 1 200 low-cost houses",https://neweralive.na/posts/posts/govt-to-construct-1-200-low-cost-houses
...
You can use pandas module and convert dataframe-table to csv easily.
import pandas as pd
from bs4 import BeautifulSoup as soup
from urllib.request import urlopen as uReq
page_url = "https://neweralive.na/today/"
ne_url = "https://neweralive.na/posts/"
uClient = uReq(page_url)
page_soup = soup(uClient.read(), "html.parser")
uClient.close()
containers = page_soup.findAll("article", {"class": "post-item"})
filename = "newera.csv"
rows = [] # Initialize list of list which is converted to dataframe.
for container in containers:
paragraph_container = container.findAll("p", {"class": "post-excerpt"})
paragraph = paragraph_container[0].text
title_container = container.findAll("h3", {"class": "post-title"})
title = title_container[0].text
weblink = ne_url + title_container[0].a["href"]
rows.append([paragraph, title, weblink]) # each row is appended
df = pd.DataFrame(rows, columns = ["paragraph","title","link"]) # col-name is headers
df.to_csv(filename, index=None)
I'm new to python and have managed to get this far trying the HTML Parser, but I'm stuck on how to get pagination for the reviews at the bottom of the page to work for the site.
The URL is in the PasteBin code, I am leaving out the URL in this thread for privacy reasons.
Any help is much appreciated.
# Reviews Scrape
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup
my_url = 'EXAMPLE.COM'
# opening up connection, grabbing, the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()
# HTML Parsing
page_soup = soup(page_html, "html.parser")
# Grabs each review
reviews = page_soup.findAll("div",{"class":"jdgm-rev jdgm-divider-top"})
filename = "compreviews.csv"
f = open(filename, "w")
headers = "Score, Title, Content\n"
f.write(headers)
# HTML Lookup Location per website and strips spacing
for container in reviews:
# score = container.div.div.span["data-score"]
score = container.findAll("span",{"data-score":True})
user_score = score[0].text.strip()
title_review = container.findAll("b",{"class":"jdgm-rev__title"})
user_title = title_review[0].text.strip()
content_review = container.findAll("div",{"class":"jdgm-rev__body"})
user_content = content_review[0].text.strip()
print("user_score:" + score[0]['data-score'])
print("user_title:" + user_title)
print("user_content:" + user_content)
f.write(score[0]['data-score'] + "," +user_title + "," +user_content + "\n")
f.close()
The page does an xhr GET request using a query string to get results. This query string has parameters for reviews per page and page number. You can make an initial request with what seems like the max reviews per page of 31, extract the html from the json returned then grab the page count; write a loop to run over all pages getting results. Example construct below:
import requests
from bs4 import BeautifulSoup as bs
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
with requests.Session() as s:
r = s.get(start_url).json()
soup = bs(r['html'], 'lxml')
print([i.text for i in soup.select('.jdgm-rev__author')])
print([i.text for i in soup.select('.jdgm-rev__title')])
total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
for page in range(2, total_pages + 1):
r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
soup = bs(r['html'], 'lxml')
print([i.text for i in soup.select('.jdgm-rev__author')])
print([i.text for i in soup.select('.jdgm-rev__title')]) #etc
Example dataframe to csv
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd
start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
authors = []
titles = []
with requests.Session() as s:
r = s.get(start_url).json()
soup = bs(r['html'], 'lxml')
authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])
for page in range(2, total_pages + 1):
r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
soup = bs(r['html'], 'lxml')
authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
titles.extend([i.text for i in soup.select('.jdgm-rev__title')]) #etc
headers = ['Author','Title']
df = pd.DataFrame(zip(authors,titles), columns = headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8',index = False )
Is there something wrong with my loop statements or have i got the wrong tag? but when i check len(containers) it gives me 20 which means i have got the tag right but when i try to use a selector like i used p.text and strong.text stored into a variable and then write it into a file i get just only one element for product_name and one for price. kindly review
Here is my script:
from urllib.request import urlopen as Ureq
from bs4 import BeautifulSoup as soup
my_url = 'https://laptopparts.ca/collections/types?q=Accessories'
Uclient = Ureq(my_url)
page_html = Uclient.read()
Uclient.close()
page_soup = soup(page_html, "html.parser")
containers = page_soup.findAll("div",{"class":"grid__item large--one-quarter medium-down--one-half"})
filename = "products.csv"
f = open(filename, "w")
headers = "Title, Price\n"
f.write(headers)
for container in containers:
title_container = container.findAll("p")
product_name = title_container[0].text
price_container = container.findAll("strong")
price = price_container[0].text
print("product_name " + product_name)
print("price " + price)
f.write(product_name.replace(",","|") + "," + price + "\n")
f.close()
Give it a go. The below script should fetch you all the item titles and price and write those in a csv file accordingly. I tried to shake off some extraneous lines which made your script look horrible.
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
res = urlopen('https://laptopparts.ca/collections/types?q=Accessories').read()
soup = BeautifulSoup(res, "html.parser")
with open("output.csv","w",newline="") as infile:
writer = csv.writer(infile)
writer.writerow(['Title','Price'])
for container in soup.find_all("div",{"class":"large--one-quarter"}):
title = container.find("p").text
price = container.find("strong").text
print(title,price)
writer.writerow([title,price])
I am parsing an HTMl page and am having a hard time figuring out how to pull a certain 'p' tag without a class or on id. I am trying to reach the tag of 'p' with the lat and long. Here is my current code:
import bs4
from urllib import urlopen as uReq #this opens the URL
from bs4 import BeautifulSoup as soup #parses/cuts the html
my_url = 'http://www.fortwiki.com/Battery_Adair'
print(my_url)
uClient = uReq(my_url) #opens the HTML and stores it in uClients
page_html = uClient.read() # reads the URL
uClient.close() # closes the URL
page_soup = soup(page_html, "html.parser") #parses/cuts the HTML
containers = page_soup.find_all("table")
for container in containers:
title = container.tr.p.b.text.strip()
history = container.tr.p.text.strip()
lat_long = container.tr.table
print(title)
print(history)
print(lat_long)
Link to website: http://www.fortwiki.com/Battery_Adair
The <p> tag you're looking for is very common in the document, and it doesn't have any unique attributes, so we can't select it directly.
A possible solution would be to select the tag by index, as in bloopiebloopie's answer.
However that won't work unless you know the exact position of the tag.
Another possible solution would be to find a neighbouring tag that has distinguishing attributes/text and select our tag in relation to that.
In this case we can find the previous tag with text: "Maps & Images", and use find_next to select the next tag.
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
b = soup.find('b', text='Maps & Images')
if b:
lat_long = b.find_next().text
This method should find the coordinates data in any www.fortwiki.com page that has a map.
You can use re to match partial text inside a tag.
import re
import requests
from bs4 import BeautifulSoup
url = 'http://www.fortwiki.com/Battery_Adair'
r = requests.get(url)
soup = BeautifulSoup(r.text, "html.parser")
lat_long = soup.find('p', text=re.compile('Lat:\s\d+\.\d+\sLong:')).text
print(lat_long)
# Lat: 24.5477038 Long: -81.8104541
I am not exactly sure what you want but this works for me. There are probably neeter ways of doing it. I am new to python
soup = BeautifulSoup(requests.get("http://www.fortwiki.com/Battery_Adair").content, "html.parser")
x = soup.find("div", id="mw-content-text").find("table").find_all("p")[8]
x = x.get_text()
x = x.split("Long:")
lat = x[0].split(" ")[1]
long = x[1]
print("LAT = " + lat)
# LAT = 24.5477038
print("LNG = " + long)
# LNG = -81.8104541