Web Scrape - Can anyone assist in cleaning this up? - python

I'm fairly new to this and I've been working on scraping a webpage for a number of days now. I have been actively trying to avoid asking the question, but I'm seriously stuck.
My Problems
Where the span loop is currently positioned, it prints all the prices for every listing each time it runs through the "for product" loop. If I place it outside of this loop, it prints either the first or the last price in the list. How do I extract the price and print it alongside each individual product?
I understand I have a lot of unused imports listed. These were just various avenues I was trying and I have yet to remove them.
The end goal here is to push to either a JSON or CSV file (also not currently written, but I have a fair idea how to approach this aspect once I have the data).
from bs4 import BeautifulSoup
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib
import locale
import json
from selenium import webdriver
# Question listing: scrapes one product category page and prints the details
# of every product found on it.
# NOTE(review): indentation was lost in this listing; presumably the lines
# after `for product in products:` form the loop body - confirm against the
# original post.
os.environ["PYTHONIOENCODING"] = "utf-8"
# The same page is loaded twice: once in a Selenium-driven Chrome (used for
# the prices below) and once with requests (parsed with BeautifulSoup).
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
browser.get("https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html")
URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
# One <div class="GC62 Product"> per product.
products = soup.find_all("div", "GC62 Product")
for product in products:
# title: text of the product's <h3>, or '' when there is none
title = product.find("h3")
titleText = title.text if title else ''
# manufacturer name
manufacturer = product.find("div", "GC5 ProductManufacturer")
manuText = manufacturer.text if manufacturer else ''
# image location: href of the first <a> inside the ProductImage div,
# appended to the site's /contents base
img = product.find("div", "ProductImage")
imglinks = img.find("a") if img else ''
imglinkhref = imglinks.get('href') if imglinks else ''
imgurl = 'https://www.mcavoyguns.co.uk/contents'+imglinkhref
#print(imgurl.replace('..', ''))
# description
description = product.find("div", "GC12 ProductDescription")
descText = description.text if description else ''
# more description
more = product.find("div", "GC12 ProductDetailedDescription")
moreText = more.text if more else ''
# price - not right: this query runs against the whole page in the Selenium
# browser, not against the current `product`, so it returns the price spans
# of every product each time it is executed.
spans = browser.find_elements_by_css_selector("div.GC20.ProductPrice span")
# Visits every second span (indices 0, 2, 4, ...) and prints its text.
for i in range(0,len(spans),2):
span = spans[i].text
print(span)
# No effect on the iteration: range() supplies the next value of i.
i+=1
print(titleText)
print(manuText)
print(descText)
print(moreText)
# '..' is removed so the relative href joins cleanly onto the base URL.
print(imgurl.replace('..', ''))
print("\n")
output:
£1,695.00
£1,885.00
£1,885.00
£2,015.00
£2,175.00
£2,175.00
£2,385.00
£2,115.00
£3,025.00
£3,315.00
£3,635.00
£3,925.00
£2,765.00
£3,045.00
£3,325.00
£3,615.00
£3,455.00
£2,815.00
£3,300.00
£3,000.00
£7,225.00
£7,555.00
£7,635.00
£7,015.00
£7,355.00
12G Beretta DT11 Trap Adjustable
beretta
Click on more for full details.
You may order this gun online, however due to UK Legislation we are not permitted to ship directly to you, we can however ship to a registered firearms dealer local to you. Once we receive your order, we will contact you to arrange which registered firearms dealer you would like the gun to be shipped to.
DT 11 Trap (Steelium Pro)
12
2 3/4"
10x10 rib
3/4&F
30"/32" weight; 4k
https://www.mcavoyguns.co.uk/contents/media/l_dt11_02.jpg

Comments from #baduker pointed me in the correct direction - indentations. Thanks!
import requests
import shutil
import csv
import pandas
from pandas import DataFrame
import re
import os
import urllib
import locale
import json
from selenium import webdriver
# Scrape one product category page and print, for each product, its price
# followed by its title, manufacturer, descriptions and image URL.

# Local import: this listing's import section omits BeautifulSoup although
# the script uses it.
from bs4 import BeautifulSoup

os.environ["PYTHONIOENCODING"] = "utf-8"

URL = 'https://www.mcavoyguns.co.uk/contents/en-uk/d130_Beretta_Over___Under_Competeition_shotguns.html'
CONTENTS_URL = 'https://www.mcavoyguns.co.uk/contents'

# Prices are read through a Selenium-driven browser, as in the original.
browser = webdriver.Chrome(executable_path='C:/Users/andrew.glass/chromedriver.exe')
try:
    browser.get(URL)
    # Query the price spans ONCE, outside the product loop: the selector
    # matches the whole page, so running it per product repeats every price.
    spans = browser.find_elements_by_css_selector("div.GC20.ProductPrice span")
    # Every second span carries the price text (same stride the original
    # used) - assumes two spans per price block; TODO confirm on the page.
    prices = [span.text for span in spans[::2]]
finally:
    # The texts are already extracted, so the browser can be released.
    browser.quit()

page = requests.get(URL)
soup = BeautifulSoup(page.content, 'html.parser')
products = soup.find_all("div", "GC62 Product")

for position, product in enumerate(products):
    # title
    title = product.find("h3")
    titleText = title.text if title else ''

    # manufacturer name
    manufacturer = product.find("div", "GC5 ProductManufacturer")
    manuText = manufacturer.text if manufacturer else ''

    # image location; a missing <a> or href yields the bare contents URL
    img = product.find("div", "ProductImage")
    imglinks = img.find("a") if img else None
    imglinkhref = imglinks.get('href', '') if imglinks else ''
    # Dropping '..' from the relative href makes it join onto the base URL.
    imgurl = CONTENTS_URL + imglinkhref.replace('..', '')

    # description
    description = product.find("div", "GC12 ProductDescription")
    descText = description.text if description else ''

    # more description
    more = product.find("div", "GC12 ProductDetailedDescription")
    moreText = more.text if more else ''

    # price: matched to the product by position on the page - assumes the
    # price blocks appear in the same order as the product divs.
    priceText = prices[position] if position < len(prices) else ''

    print(priceText)
    print(titleText)
    print(manuText)
    print(descText)
    print(moreText)
    print(imgurl)
    print("\n")

Related

Python: Get element next to href

Python code:
# Question listing: collects the <a> tags of the 26 per-letter player index
# pages.
url = 'https://www.basketball-reference.com/players/'
# One entry per lowercase letter, 'a' .. 'z'.
initial = list(string.ascii_lowercase)
# Index page URL for each letter (base URL + letter).
initial_url = [url + i for i in initial]
# Downloads every index page: one HTTP request per letter.
html_initial = [urllib.request.urlopen(i).read() for i in initial_url]
soup_initial = [BeautifulSoup(i, 'html.parser') for i in html_initial]
# Calling a soup object is shorthand for find_all(), so each element here is
# the list of all <a> tags of one page.
tags_initial = [i('a') for i in soup_initial]
# Prints the 51st <a> tag of the first page.
print(tags_initial[0][50])
Results example:
Shareef Abdur-Rahim
From the example above, I want to extract the name of the player, which is 'Shareef Abdur-Rahim', but I want to do it for all the tags_initial lists.
Does anyone have an idea?
Could you modify your post by adding your code so that we can help you better?
Maybe that could help you :
name = soup.findAll(YOUR_SELECTOR)[0].string
UPDATE
import re
import string
from bs4 import BeautifulSoup
from urllib.request import urlopen
# Collect the link text of every "players" link on the 26 per-letter index
# pages into `datas`.
url = 'https://www.basketball-reference.com/players/'

# Alphabet
initial = list(string.ascii_lowercase)

datas = []

# URLS
urls = [url + i for i in initial]

# A separate loop variable keeps the base `url` intact.
for page_url in urls:
    # Soup Object; the context manager closes the HTTP response.
    with urlopen(page_url) as response:
        soup = BeautifulSoup(response, 'html.parser')

    # Players link
    url_links = soup.findAll("a", href=re.compile("players"))

    for link in url_links:
        # Player name; .string is None for links that wrap other tags, so
        # those are skipped instead of storing None.
        if link.string is not None:
            datas.append(link.string)

print("datas : ", datas)
Then, "datas" contains all the names of the players, but I advise you to do a little processing afterwards to remove some erroneous information like "..." or perhaps duplicates
There are probably better ways but I'd do it like this:
html = "a href=\"/teams/LAL/2021.html\">Los Angeles Lakers</a"


def extract_link_text(markup):
    """Return the text of the first ``a href`` link in *markup*.

    The text is whatever lies between the first ``>`` after ``a href`` and
    the next ``<``. Returns None when any of the three markers is missing;
    str.find() signals that with -1, which would otherwise produce a
    silently wrong slice.
    """
    start = markup.find("a href")
    if start == -1:
        return None
    tag_close = markup.find(">", start)
    if tag_close == -1:
        return None
    text_end = markup.find("<", tag_close + 1)
    if text_end == -1:
        return None
    return markup[tag_close + 1:text_end]


print(extract_link_text(html))
If you're using a scraper library it probably has a similar function built-in.

How can I scroll a particular section of a dynamic web page using selenium webdriver in python?

I have found many references that scroll the entire webpage, but I am looking to scroll a particular section. I am working on marketwatch.com, in the "Latest News" tab section. How can I scroll just this latest news tab using selenium webdriver?
Below is my code, which returns the headings of the news but keeps repeating the same headings.
from bs4 import BeautifulSoup
import urllib
import csv
import time
from selenium import webdriver
# Question listing (Python 2 print statements): scrolls the MarketWatch news
# viewer and prints the headlines it finds.
# NOTE(review): indentation was lost in this listing; presumably everything
# from `browser.execute_script` through the final `continue` is the body of
# the while loop - confirm against the original post.
count = 0
browser = webdriver.Chrome()
browser.get("https://www.marketwatch.com/newsviewer")
# The page source is captured and parsed once, before the loop, so `soup`
# and `arkodiv` are never refreshed after scrolling.
pageSource = browser.page_source
soup = BeautifulSoup(pageSource, 'lxml')
arkodiv = soup.find("ol", class_="viewport")
while browser.find_element_by_tag_name('ol'):
# Scrolls the window itself to the bottom of the document.
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(0.5)
# `div` and `Data_11` are assigned but not used below.
div = list(arkodiv.find_all('div', class_= "nv-details"))
heading = []
Data_11 = list(soup.find_all("div", class_ = "nv-text-cont"))
# <li> elements that carry a `timestamp` attribute.
datetime = list(arkodiv.find_all("li", timestamp = True))
for sa in datetime:
sh = sa.find("div", class_ = "nv-text-cont")
# Only items whose text container holds an <a> with a class are kept.
if sh.find("a", class_ = True):
di = sh.text.strip()
# Drops every non-ASCII character from the headline.
di = di.encode('ascii', 'ignore').decode('ascii')
else:
continue
print di
heading.append((di))
count = count+1
# `in` on a bs4 Tag tests membership among its direct children.
if 'End of Results' in arkodiv:
print 'end'
break
else:
continue
print count
That happens because the script you are executing scrolls to the bottom of the page.
To keep scrolling inside the element fetching news you need to replace this:
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
with this:
browser.execute_script("document.documentElement.getElementsByClassName('viewport')[0].scrollTop = 999999")
EDIT
This is the complete working solution:
from bs4 import BeautifulSoup
import urllib
import csv
import time
from selenium import webdriver
# Scroll the "viewport" list of the MarketWatch news viewer and print each
# headline once; `count` ends up as the number of distinct headlines.
count = 0
# Kept outside the loop so headlines already printed on an earlier pass are
# remembered; re-creating the set on every pass would not remove repeats.
heading = set()

browser = webdriver.Chrome()
browser.get("https://www.marketwatch.com/newsviewer")

while browser.find_element_by_tag_name('ol'):
    # Re-parse on every pass so newly loaded items are seen.
    pageSource = browser.page_source
    soup = BeautifulSoup(pageSource, 'lxml')
    arkodiv = soup.find("ol", class_="viewport")

    # Scroll inside the news list element rather than the whole window.
    browser.execute_script("document.documentElement.getElementsByClassName('viewport')[0].scrollTop = 999999")
    time.sleep(0.5)

    # <li> elements that carry a `timestamp` attribute.
    for sa in arkodiv.find_all("li", timestamp=True):
        sh = sa.find("div", class_="nv-text-cont")
        # Skip items without a text container or without a classed link.
        if sh is None or not sh.find("a", class_=True):
            continue
        di = sh.text.strip()
        # Drop non-ASCII characters from the headline.
        di = di.encode('ascii', 'ignore').decode('ascii')
        if di in heading:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(di)
        heading.add(di)
        count = count + 1

    # Search the list's text: `in` on the Tag itself only compares against
    # its direct children. Assumes the end marker is rendered inside the
    # list - TODO confirm; otherwise the loop never ends.
    if 'End of Results' in arkodiv.text:
        print('end')
        break

print(count)
EDIT 2
You may also want to change how you store the headers, since the way you currently do keeps duplicates inside the list. Changed it to a set so that doesn't happen.

How can I parse querystring from webpage?

I'm trying to parse all the query strings present in a page, so that I can use them to navigate to a specific page. The code I tried for doing this is below:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import datetime
import dateutil.parser
import time
import pytz
"""python espncricinfo library module https://github.com/dwillis/python-espncricinfo """
from espncricinfo.match import Match
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
# Question listing: builds today's date in the Asia/Kolkata time zone,
# fetches the match index page and prints the latest batting for each match.
# NOTE(review): indentation was lost in this listing; presumably the last
# four lines form the body of `for p in match_no:`.
"""----time-zone-calculation----"""
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
# Today's date formatted as YYYY-MM-DD.
datestring_today = datetime_today.strftime("%Y-%m-%d")
"""------URL of page to parse-------with a date of today-----"""
# The query string contains the literal text "datestring_today"; the variable
# computed above is not inserted into the URL.
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=datestring_today"
"""eg. url = http://www.espncricinfo.com/ci/engine/match/index.html?date=2018-02-12"""
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')
""""------parsing for matchno------"""
# For every link whose text is 'Scorecard': take what follows the 4th '/'
# of the href and cut it at the first '.'.
match_no = [x['href'].split('/',4)[4].split('.')[0] for x in
soup.findAll('a', href=True, text='Scorecard')]
for p in match_no:
""" where p is a match no, e.g p = '1122282'"""
m = Match(p)
# Bare expression: the value is evaluated and discarded.
m.latest_batting
print(m.latest_batting)
when I print match_no I get output:
['8890/scorecard/1118760/andhra-vs-tamil-nadu-group-c-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118743/assam-vs-odisha-group-a-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118745/bengal-vs-delhi-group-b-vijay-hazare-trophy-2017-18/', '8890/scorecard/1118763/chhattisgarh-vs-vidarbha-group-d-vijay-hazare-trophy-2017-18/']
This page (http://www.espncricinfo.com/ci/engine/match/index.html?date=datestring_today) contains the match_no of every game happening on that day. I want to trim this output to get only the match_no, which is a 7-digit number, e.g. [1118760, 1118743, 1118745, ...]. How can I do this? Using that match_no, I can pass it to Match() to get the details of a particular match happening on that day.
PS if no match is going on the new day then match_no returns none.
First, your code is very hard to read. You need to let your code breathe and make it appealing for others to read it.
Second, what is causing issue is probably this line:
match_no = [x['href'].split('/',4)[4].split('.')[0] for x in soup.findAll('a', href=True, text='Scorecard')]
It is hard to read too. There are far better and more readable ways of parsing the match id from a URL.
Here is example of what should be working. I did take provisional date for matches:
import re
import pytz
import requests
import datetime
from bs4 import BeautifulSoup
from espncricinfo.exceptions import MatchNotFoundError, NoScorecardError
from espncricinfo.match import Match
"""python espncricinfo library module https://github.com/dwillis/python-espncricinfo """
# from espncricinfo.match import Match
def get_match_id(link):
    """Return the 7-digit match id contained in *link*, or None.

    The id is the first run of exactly seven digits. The lookarounds reject
    digits on either side, so a longer number is never truncated to its
    first seven digits.
    """
    match_id = re.search(r'(?<![0-9])[0-9]{7}(?![0-9])', link)
    if match_id is None:
        return None
    return match_id.group()
# ----time-zone-calculation----
time_zone = pytz.timezone("Asia/Kolkata")
datetime_today = datetime.datetime.now(time_zone)
# Today's date formatted as YYYY-MM-DD.
datestring_today = datetime_today.strftime("%Y-%m-%d")

# ------URL of page to parse-------with a date of today-----
# The date value is appended; writing the variable name inside the string
# literal would send the text "datestring_today" to the server.
url = "http://www.espncricinfo.com/ci/engine/match/index.html?date=" + datestring_today
r = requests.get(url)
soup = BeautifulSoup(r.text, 'html.parser')

spans = soup.findAll('span', {"class": "match-no"})
matches_ids = []
for s in spans:
    # The filter receives None for <a> tags without an href, so guard
    # before using `in`.
    for a in s.findAll('a', href=lambda href: href is not None and 'scorecard' in href):
        match_id = get_match_id(a['href'])
        if match_id is None:
            continue
        matches_ids.append(match_id)

# ------parsing for matchno------
for p in matches_ids:
    # where p is a match no, e.g p = '1122282'
    m = Match(p)
    print(m.latest_batting)
Now, I didn't have every lib that you are using here, but this should give you an idea of how to do it.
Once again, my advice is that empty lines are your friends. They are reader's friends for sure. Make your code 'breathe'.

How to specify a range without limiting to numbers

This is the code that collects restaurant reviews.
The range of pages to fetch is hard-coded. But there is a problem: each store has a different number of reviews, and a store with few reviews should move on to the next store sooner.
The range is too large, but I cannot reduce it, because some shops do have reviews across this whole range.
How can I do this efficiently?
I came across find_all(element) while searching, but I do not know whether I have applied it to my code incorrectly.
#python3
import sys
from bs4 import BeautifulSoup
import urllib.request
import requests
from urllib.parse import quote
import time
import os
import xlwt
import random
import re
# Yelp search URL up to and including "start="; a string literal cannot span
# two source lines, so it is written as adjacent literals inside parentheses.
# (The misspelled name is kept so existing references keep working.)
FISRT_URL = ("https://www.yelp.com/search?"
             "find_desc=Korean+Food&find_loc=Seattle,+WA&start=")
# Trailing query parameter of the search URL.
LAST_URL = "&cflt=korean"
def get_link(URL, doc_name):
# Walks a search-result page, follows the link of each <h3> title and calls
# get_text() for a series of paginated URLs of that linked page.
# URL: search URL containing "t=" (the page offset is inserted right after
#      its first occurrence).
# doc_name: passed through unchanged to get_text().
# NOTE(review): indentation was lost in this listing, so the nesting of the
# loops below cannot be read from it - confirm against the original post.
# NOTE(review): `page` is read below but is not assigned anywhere in this
# listing; presumably it is initialised at module level - verify.
global jisu_i
global num
global page
# Single iteration (jisu_i == 0), i.e. a result offset of 20.
for jisu_i in range(1):
current_page_num = 20 + jisu_i*10
position = URL.index('t=')
# Inserts the offset directly after the first "t=" in URL.
URL_with_page_num = URL[: position+2] + str(current_page_num) \
+ URL[position+2 :]
print(URL_with_page_num)
importurl = URL_with_page_num
r = requests.get(importurl)
# The response bytes are decoded as EUC-KR; undecodable bytes are replaced.
soup = BeautifulSoup(r.content.decode('euc-kr','replace'), "lxml")
# Random pause of 10-15 seconds.
time.sleep(random.randint(10, 15))
# Slice of the page's <h3> elements, starting at index page+2.
for title in soup.find_all('h3')[page+2:21]:
page = page + 1
title_link = title.select('a')
for jisu_m in range(130):
print(page)
# href of the first <a> inside the title, made absolute below.
last_URL = title_link[0]['href']
print(last_URL)
first_URL = "https://www.yelp.com"
global article_URL
article_URL = first_URL + last_URL
time.sleep(random.randint(15, 30))
jisuurl = article_URL
# Fixed 99 pages in steps of 20 (start=0, 20, ..., 1960), whatever the
# number of reviews actually available.
for k in range(99): #
jisu_page_num = 0 + k * 20 #
# Everything after the first '?' is replaced by "start=<offset>".
position = jisuurl.index('?')
URL_with_page_num = jisuurl[: position + 1] + str("start=") + str(jisu_page_num)
jisu_with_page_num = URL_with_page_num
print(jisu_with_page_num)
jisu_importurl = jisu_with_page_num
get_text(URL, jisu_importurl, doc_name)
# Random pause of 40-180 seconds.
time.sleep(random.randint(40,180))
Yelp has a very well documented API here: https://www.yelp.com/developers/documentation/v3
This is the only reliable way of interacting with the site programatically.

handling pagination BeautifulSoup -Selenium

I have been struggling with this for over a week now. I am trying to learn Python and build something that will be useful to me at the same time - something to help me find a new home to rent.
I have all the code working the way I want it, except I cannot get all 550 properties; I can only get the first 25 on page 1. I have tried several methods but nothing seems to work.
If I use urlopen and take the main url + the regex of the page number, "2_p/" for example, I get a urlerror: unknown url 'h'.
If I use webdriver, Firefox tries to look up www.h.com. I really could use a little help. Attached is my code - sorry it is kind of messy and probably a little long. I am just learning, so don't be too cruel.
from urllib.request import urlopen
from selenium import webdriver
from bs4 import BeautifulSoup
import csv
import datetime
from datetime import timedelta
import time
import re
# hrefs of the listing pages found so far (filled by getLinks)
pages1 = set()
# Empty tuple; not referenced by the code in this listing.
next_page = ()
# Rows of the CSV file: the header row plus one row per scraped listing.
csv_output = [ ]
def getLinks(url):
    """Collect the listing links on the search page *url* and scrape each.

    The page is rendered in Firefox, every link whose href starts with
    /homedetails is added to the module-level set `pages1`, and getData()
    is then called for every link in that set.
    """
    driver = webdriver.Firefox()
    try:
        driver.get(url)
        # Fixed wait before the page source is read.
        time.sleep(3)
        pageSource = driver.page_source
    finally:
        # Close the browser even when loading fails; the source is already
        # captured, so it is not needed afterwards.
        driver.quit()

    # Explicit parser, so the result does not depend on which parsers
    # happen to be installed.
    bsObj = BeautifulSoup(pageSource, 'html.parser')

    # The href filter only matches tags that have an href, and a set
    # ignores repeats, so no extra membership checks are needed.
    for addr_link in bsObj.findAll("a", href=re.compile("^/homedetails/*")):
        pages1.add(addr_link['href'])

    print(len(pages1))

    for property_url in pages1:
        getData(property_url)
def getData(url):
    """Scrape one listing page and rewrite output.csv with all rows so far.

    url: site-relative path of the listing; it is appended to the Zillow
         base URL.

    Appends one row to the module-level `csv_output` and writes the whole
    list to output.csv. Returns None. A listing is skipped (nothing is
    appended or written) when an expected element is missing from the page.
    """
    base_url = 'http://www.zillow.com'
    final_url = base_url + url
    html = urlopen(final_url)
    # Explicit parser, so the result does not depend on installed parsers.
    bsObj = BeautifulSoup(html, 'html.parser')

    # Default for pages without any "Posted" text; without it the row
    # construction below would fail on an unassigned name.
    posted_date = ''

    try:
        # Gets Property Address
        address = bsObj.find("header", {"class": "zsg-content-header addr"}).find("h1")
        s_address = address.get_text()
        print(address)

        # Beds, baths and sqft are three consecutive "addr_bbs" spans.
        beds = bsObj.find("span", {"class": "addr_bbs"})
        s_beds = beds.get_text()
        baths = beds.find_next("span", {"class": "addr_bbs"})
        s_baths = baths.get_text()
        sqft = baths.find_next("span", {"class": "addr_bbs"})
        s_sqft = sqft.get_text()

        # Gets rent_amount
        rent_amount = bsObj.find("div", {"class": "main-row home-summary-row"}).span
        s_rent_amount = rent_amount.get_text()

        # Gets rent_zestiment: the span following the tooltip launcher
        zestiment_holder = bsObj.find("span", {"class": "zsg-tooltip-launch zsg-tooltip-launch_keyword"})
        rent_zestiment = zestiment_holder.find_next("span")
        s_rent_zestiment = rent_zestiment.get_text()

        # Gets Date posted on Zillow: today when the text mentions hours,
        # otherwise today minus the first number found in the text.
        for time_posted in bsObj(text=re.compile("Posted")):
            posted = time_posted.parent.get_text()
            if 'hours' in posted:
                posted_date = datetime.date.today()
            else:
                # A text without digits raises AttributeError here, which
                # is handled like any other missing element.
                days_subtracted = int(re.search(r'\d+', posted).group())
                posted_date = datetime.date.today() - datetime.timedelta(days=days_subtracted)

        # Gets Property subdivision
        subdivision = bsObj.find(id="hdp-neighborhood").h2
        s_subdivision = subdivision.get_text()

        # Gets property_manager_name
        property_manager_name = bsObj.find("span", {"class": "snl company-name"})
        s_property_manager_name = property_manager_name.get_text()

        # Gets property_manager_phone
        property_manager_phone = bsObj.find("span", {"class": "snl phone"})
        s_property_manager_phone = property_manager_phone.get_text()

        # Gets disc_of_property; kept as text, since encoding to bytes
        # makes the csv writer emit the b'...' representation.
        s_disc_of_property = bsObj.find('div', {'class': "notranslate"}).text

        # Gets url_of_listing so I can see Photos if interested
        s_listing_url = final_url
    except AttributeError:
        # find()/attribute access returned None for a missing element.
        return None

    csv_data = [s_address, s_beds, s_baths, s_sqft, s_rent_amount, s_rent_zestiment, posted_date, s_subdivision, s_property_manager_name, s_property_manager_phone, s_disc_of_property, s_listing_url]
    csv_output.append(csv_data)

    # The whole file is rewritten on every call. newline='' is what the csv
    # module expects; utf-8 keeps non-ASCII description text intact.
    with open("output.csv", 'w', newline='', encoding='utf-8') as resultFile:
        wr = csv.writer(resultFile)
        wr.writerows(csv_output)
# Column titles, in the same order as the row built in getData().
header = ['Address', 'Beds', 'Baths', 'Sqft', 'Rent Amount', 'rent Zestiment', 'Posted Date', 'Subdivision', 'Property Manager Name', 'Property Manager Phone', 'Disc of Property', 'URL']
# Appended before any listing is scraped, so it is the first row written.
csv_output.append(header)
# Entry point: start from the search-results page for Jackson County, MO.
getLinks("http://www.zillow.com/homes/for_rent/Jackson-County-MO/house,mobile_type/1804_rid/6m_days/39.371994,-93.635788,38.697836,-95.077744_rect/9_zm/")
Edit:
The 'while' that has been commented out was my last attempt at handling the pagination.

Categories

Resources