Python HTML Parser Pagination

I'm new to Python and have managed to get this far using the HTML parser, but I'm stuck on how to get pagination working for the reviews at the bottom of the page for this site.
The URL is in the Pastebin code; I am leaving it out of this thread for privacy reasons.
Any help is much appreciated.
# Reviews Scrape
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'EXAMPLE.COM'

# opening up connection, grabbing the page
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML parsing
page_soup = soup(page_html, "html.parser")

# Grabs each review
reviews = page_soup.findAll("div", {"class": "jdgm-rev jdgm-divider-top"})

filename = "compreviews.csv"
f = open(filename, "w")
headers = "Score, Title, Content\n"
f.write(headers)

# HTML lookup location per website, stripping spacing
for container in reviews:
    # score = container.div.div.span["data-score"]
    score = container.findAll("span", {"data-score": True})
    user_score = score[0].text.strip()

    title_review = container.findAll("b", {"class": "jdgm-rev__title"})
    user_title = title_review[0].text.strip()

    content_review = container.findAll("div", {"class": "jdgm-rev__body"})
    user_content = content_review[0].text.strip()

    print("user_score:" + score[0]['data-score'])
    print("user_title:" + user_title)
    print("user_content:" + user_content)

    f.write(score[0]['data-score'] + "," + user_title + "," + user_content + "\n")

f.close()

The page does an XHR GET request using a query string to get results. That query string has parameters for reviews per page and page number. You can make an initial request with what seems to be the maximum of 31 reviews per page, extract the HTML from the JSON returned, grab the total page count, and then loop over the remaining pages to collect results. Example construct below:
import requests
from bs4 import BeautifulSoup as bs

start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'

with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    print([i.text for i in soup.select('.jdgm-rev__author')])
    print([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])
        print([i.text for i in soup.select('.jdgm-rev__title')])  # etc.
Example of writing the results to a CSV via a dataframe:
import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'
authors = []
titles = []

with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
    titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
        titles.extend([i.text for i in soup.select('.jdgm-rev__title')])  # etc.

headers = ['Author', 'Title']
df = pd.DataFrame(zip(authors, titles), columns=headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8', index=False)

Related

Is there a way to extract only the string of a paragraph when webscraping in python?

We want to know if there is a way to extract only the string of a paragraph when web scraping in Python.
The problem lies in our definition of 'price'.
Code:
import requests
from bs4 import BeautifulSoup

def scraper(url):
    url.status_code
    url.headers
    c = url.content
    soup = BeautifulSoup(c, "html.parser")
    samples2 = soup.find_all(class_="product-price font-weight-bold mb-0")
    for p in samples2:
        price = p
        print(price)
        print()

url = "https://www.bevco.dk/spiritus/"
url = requests.get(url)
scraper(url)
You only have to modify the price variable: extract the text from between the tags and, for aesthetic purposes, strip the extracted string.
import requests
from bs4 import BeautifulSoup

def scraper(url):
    url.status_code
    url.headers
    c = url.content
    soup = BeautifulSoup(c, "html.parser")
    samples2 = soup.find_all(class_="product-price font-weight-bold mb-0")
    for p in samples2:
        price = p
        print(price.get_text().strip())
        print()

url = "https://www.bevco.dk/spiritus/"
url = requests.get(url)
scraper(url)
import requests
from bs4 import BeautifulSoup

def getdata(url):
    r = requests.get(url)
    return r.text

htmldata = getdata("https://www.bevco.dk/spiritus")
soup = BeautifulSoup(htmldata, 'html.parser')
for data in soup.find_all("p", {'class': 'product-price font-weight-bold mb-0'}):
    print(data.get_text())

How can Beautifulsoup scrape the pages inside this list of hyperlinks?

I am trying to scrape the contents of the hyperlinks on the left side of this page. I can already scrape the contents of one such page, so now I am trying to run the script on each individual hyperlink on the left side of the page.
URL: https://bitinfocharts.com/top-100-richest-dogecoin-addresses-3.html
I think what needs to happen is for the URL to be a dynamic variable, updated by a loop that goes through all of the hyperlinks on the page above, although I'm not exactly sure this is the best way to approach it, as this is my first project.
Any advice is greatly appreciated.
Here is the code that I am trying to plug this into.
import csv
import requests
from bs4 import BeautifulSoup as bs

url = 'https://bitinfocharts.com/dogecoin/address/DN5Hp2kCkvCsdwr5SPmwHpiJgjKnC5wcT7'
headers = {"User-Agent": "Mozilla/5.0"}

r = requests.get(url, headers=headers)
soup = bs(r.content, 'lxml')
table = soup.find(id="table_maina")

headers = []
datarows = []

# Get crypto address for the filename
item = soup.find('h1').text
newitem = item.replace('Dogecoin', '')
finalitem = newitem.replace('Address', '')

for row in table.find_all('tr'):
    heads = row.find_all('th')
    if heads:
        headers = [th.text for th in heads]
    else:
        datarows.append([td.text for td in row.find_all('td')])

fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
fcsv.writerow(headers)
fcsv.writerows(datarows)
A simple way would be to make an initial request and extract all the links in the second column of the table.
Then loop over those links, make a request for each, and continue with your existing code, except also handling the case where no table is present.
import csv
import requests
from bs4 import BeautifulSoup as bs

with requests.Session() as s:
    s.headers = {"User-Agent": "Safari/537.36"}
    r = s.get('https://bitinfocharts.com/top-100-richest-dogecoin-addresses-3.html')
    soup = bs(r.content, 'lxml')
    address_links = [i['href'] for i in soup.select('.table td:nth-child(2) > a')]

    for url in address_links:
        r = s.get(url)
        soup = bs(r.content, 'lxml')
        table = soup.find(id="table_maina")

        if table:
            item = soup.find('h1').text
            newitem = item.replace('Dogecoin', '')
            finalitem = newitem.replace('Address', '')

            # reset per address so each CSV only contains that address's rows
            headers = []
            datarows = []

            for row in table.find_all('tr'):
                heads = row.find_all('th')
                if heads:
                    headers = [th.text for th in heads]
                else:
                    datarows.append([td.text for td in row.find_all('td')])

            fcsv = csv.writer(open(f'{finalitem}.csv', 'w', newline=''))
            fcsv.writerow(headers)
            fcsv.writerows(datarows)
        else:
            print('no table for: ', url)

Python Beautiful Soup Scraping, Newegg

I'm brand new to Python and figured I'd try to learn by making a web scraper. I'm trying to scrape a Newegg page for graphics cards, but I'm running into errors. All I want to do is grab the data and write it to a CSV file that I can view, but if I comment that part out I get another error, and I can't figure it out. Any help is appreciated!
File "webScrape.py", line 32, in
price = price_container[0].text.strip("|")
IndexError: list index out of range
# import BeautifulSoup 4 and use urllib to import urlopen
import bs4
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

# url where we will grab the product data
my_url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=graphics+card&ignorear=0&N=-1&isNodeId=1'

# open connection and grab the URL page information, read it, then close it
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# parse html from the page
page_soup = soup(page_html, "html.parser")

# find each product within the item-container class
containers = page_soup.findAll("div", {"class": "item-container"})

# write a file named products.csv with the data returned
filename = "products.csv"
f = open(filename, "w")

# create headers for products
headers = "price, product_name, shipping\n"
f.write("")

# define containers based on location on webpage and their DOM elements
for container in containers:
    price_container = container.findAll("li", {"class": "price-current"})
    price = price_container[0].text.strip("|")

    title_container = container.findAll("a", {"class": "item-title"})
    product_name = title_container[0].text

    shipping_container = container.findAll("li", {"class": "price-ship"})
    shipping = shipping_container[0].text.strip()

    f.write(price + "," + product_name.replace(",", "|") + "," + shipping + "\n")

f.close()
You could write to a dataframe, which is easy to export to CSV. I have added an additional class selector of .list-wrap to the titles selector to ensure all the lists are the same length.
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

def main():
    url = 'https://www.newegg.com/Product/ProductList.aspx?Submit=ENE&DEPA=0&Order=BESTMATCH&Description=+graphics+cards&N=-1&isNodeId=1'
    res = requests.get(url)
    soup = BeautifulSoup(res.content, "lxml")
    prices = soup.select('.price-current')
    titles = soup.select('.list-wrap .item-title')
    shipping = soup.select('.price-ship')
    items = list(zip(titles, prices, shipping))
    results = [[title.text.strip(),
                re.search(r'\$\d+\.\d+', price.text.strip()).group(0),
                ship.text.strip()]
               for title, price, ship in items]
    df = pd.DataFrame(results, columns=['title', 'current price', 'shipping cost'])
    df.to_csv(r'C:\Users\User\Desktop\Data.csv', sep=',', encoding='utf-8', index=False)

if __name__ == "__main__":
    main()
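As an aside, the IndexError in the original code most likely comes from .item-container divs (for example ad or placeholder blocks) that contain no .price-current element, so price_container[0] indexes into an empty list. If you would rather keep the urllib-based version, a minimal sketch of a guard inside the existing loop could look like this (skipping incomplete containers is an assumption about the behaviour you want):
# Guard sketch for the original loop: skip containers missing any field.
for container in containers:
    price_container = container.findAll("li", {"class": "price-current"})
    title_container = container.findAll("a", {"class": "item-title"})
    shipping_container = container.findAll("li", {"class": "price-ship"})

    # likely an ad/placeholder block if any of these lists is empty; skip it
    if not (price_container and title_container and shipping_container):
        continue

    price = price_container[0].text.strip("|")
    product_name = title_container[0].text
    shipping = shipping_container[0].text.strip()
    f.write(price + "," + product_name.replace(",", "|") + "," + shipping + "\n")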

Access Hidden Data on a page

I need to access the following website: http://mothoq.com/store/22, scroll down till I see the phone icon, click on it, and scrape the phone number.
I have successfully connected to the website and am able to scrape all the data I need, except for the phone number.
I have tried to use
soup.find_all('p', attrs={"align": "center"})
My code is:
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)

r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "html5lib")

results = soup.find('div', attrs={'id': 'subtitle'})
for storeData in results:
    storeName = soup.find('h1')
    url = soup.find('font').text
    contacts = soup.find_all('p', attrs={"class": "store_connect_details"})
    for storeContact in contacts:
        storePhone = soup.find_all('p', attrs={"align": "center"})
        storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"})['href']
        storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"})['href']
        storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"})['href']

print(storePhone)
Thanks!
You should search for the hidden div with id="store-telephone-form" and take the second <p> tag from it.
import requests
import pandas as pd
from bs4 import BeautifulSoup

records = []
storeId = 22
url = "http://mothoq.com/store/" + str(storeId)

r = requests.get(url)
content = r.text
soup = BeautifulSoup(content, "lxml")

results = soup.find('div', attrs={'id': 'subtitle'})
storeName = soup.find('h1')
url = soup.find('font').text
contacts = soup.find_all('p', attrs={"class": "store_connect_details"})

try:
    # the phone number sits in a hidden div; its second <p> holds the number
    storePhone = soup.find('div', attrs={"id": "store-telephone-form"}).select('p')[1].text
    storeTwitter = soup.find('a', attrs={"class": "connect_icon_twitter"}).get('href')
    storeFacebook = soup.find('a', attrs={"class": "connect_icon_facebook"}).get('href')
    storeLinkedin = soup.find('a', attrs={"class": "connect_icon_linkedin"}).get('href')
except:
    pass

print(storePhone)

Problems retrieving information from imdb

I'm trying to get the movie titles from an imdb watchlist. This is my code:
import requests, bs4
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
print(soup.find_all('.lister-item-header'))
Even though '.lister-item-header' exists in the Chrome developer console, it doesn't exist in the HTML file that the requests module downloaded. I've also tried using regular expressions. What would be the best way of retrieving the titles?
You should select elements by their class like this:
import requests
import bs4

url = 'http://www.imdb.com/chart/top'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
rows = soup.select('.titleColumn > a')
for row in rows:
    print(row.text)
Or you can do it this way:
import requests
import bs4

url = 'http://www.imdb.com/chart/top'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
rows = soup.find_all('td', class_='titleColumn')
for row in rows:
    print(row.a.text)
The data is loaded from a JSON object embedded in the raw HTML file, so we can parse that object and get the titles.
import requests
import bs4
import json

url = 'http://www.imdb.com/user/ur69187878/watchlist?ref_=wt_nv_wl_all_1'
res = requests.get(url)
res.raise_for_status()
soup = bs4.BeautifulSoup(res.text, "html.parser")
# rows = soup.find_all('h3', class_='list-item-header')

# find the <script> tag that pushes the initial state JSON
js_elements = soup.find_all('script')
js_text = None
search_str = 'IMDbReactInitialState.push('
for element in js_elements:
    text = element.text
    if search_str in text:
        js_text = text.strip()
        break

# slice out the JSON argument (dropping the trailing ");") and parse it
json_start = js_text.index(search_str) + len(search_str)
json_text = js_text[json_start:-2]
json_obj = json.loads(json_text)

for title in json_obj['titles']:
    json_title = json_obj['titles'][title]
    print(json_title['primary']['title'])
But I have to say that this is not a general method for attacking this kind of problem. If you want a general solution for all pages whose data is loaded from JSON or an API, you can use other approaches such as Selenium, which drives a real browser so the page's JavaScript runs before you parse it.
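A minimal Selenium sketch of that approach might look like the following (this assumes a local chromedriver install; the .lister-item-header selector and the 10-second wait are assumptions based on the markup the question describes):
import bs4
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

url = 'http://www.imdb.com/user/ur69187878/watchlist'
driver = webdriver.Chrome()  # assumes chromedriver is installed locally
try:
    driver.get(url)
    # wait until at least one title header has been rendered by the page's JS
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, '.lister-item-header'))
    )
    # hand the fully rendered HTML to BeautifulSoup as before
    soup = bs4.BeautifulSoup(driver.page_source, "html.parser")
    for header in soup.select('.lister-item-header'):
        print(header.get_text(strip=True))
finally:
    driver.quit()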
