Using an if statement to exclude entries containing a substring - Python

I'm trying to scrape the results and statistics from the last 4 seasons of snooker from https://cuetracker.net.
I succeeded (somewhat) in scraping most of the data, but I neglected to notice that walkovers are included. These walkovers contain only nationality, player name and score data and no stats, so because I scraped my data into lists, the lists no longer align correctly when I convert them into a DataFrame.
I am trying to use an if ... in check in my loop to skip these walkover matches and therefore realign the correct stats with the correct match. I'm using the word 'Walkover' to stop the item being appended to my list.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.support.select import Select
from bs4 import BeautifulSoup
import os
import re
import time
import pandas as pd

def wait_for_page_load():
    timer = 15
    start_time = time.time()
    page_state = None
    while page_state != 'complete':
        time.sleep(0.5)
        page_state = browser.execute_script('return document.readyState;')
        if time.time() - start_time > timer:
            raise Exception('Timeout :(')
chrome_path = r"C:\Users\George\Desktop\chromedriver.exe"
browser = webdriver.Chrome(chrome_path)
page_source = browser.page_source
browser.get("https://cuetracker.net/seasons")

links = browser.find_elements_by_css_selector("table.table.table-striped a")
hrefs = []
for link in links:
    hrefs.append(link.get_attribute("href"))
hrefs = hrefs[1:2]

hrefs2 = []
for href in hrefs:
    browser.get(href)
    wait_for_page_load()
    links2 = browser.find_elements_by_xpath('.//tr/td[2]/a')
    for link in links2:
        hrefs2.append(link.get_attribute("href"))

player_1_nationality = []
player_1_name = []
player_1_score = []
player_2_score = []
player_2_nationality = []
player_2_name = []
date_played = []
player_1_points_scored = []
player_2_points_scored = []
player_1_fifty_plus_breaks = []
player_2_fifty_plus_breaks = []
match_progress = []
player_1_points_per_frame = []
player_2_points_per_frame = []
for href in hrefs2:
    browser.get(href)
    wait_for_page_load()

    list_1_nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for lis in list_1_nationality:
        player_1_nationality.append(lis.get_attribute("alt"))

    list_1_player = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/a')
    for li in list_1_player:
        player_1_name.append(li.get_attribute('text'))

    list_2_nationality = browser.find_elements_by_xpath('.//div/div[2]/div[3]/img')
    for nationality_2 in list_2_nationality:
        player_2_nationality.append(nationality_2.get_attribute("alt"))

    list_2_name = browser.find_elements_by_xpath('.//div/div[2]/div[3]/a')
    for name_2 in list_2_name:
        player_2_name.append(name_2.get_attribute('text'))

    list_1_score = browser.find_elements_by_xpath('.//div/div[2]/div[2]/span[1]/b')
    for score in list_1_score:
        player_1_score.append(score.get_attribute('innerText'))

    list_2_score = browser.find_elements_by_xpath('.//div/div[2]/div[2]/span[3]')
    for score_2 in list_2_score:
        player_2_score.append(score_2.get_attribute('innerText'))

    #list_date_played = browser.find_elements_by_xpath('.//div[4]/div[2]/div/div')
    #for date in list_date_played:
    #    date_played.append(date.get_attribute('innerText'))

    page_source = browser.page_source
    soup = BeautifulSoup(page_source, 'lxml')

    points_scored_elem = soup.find_all('div', text='Points Scored')
    for elem in points_scored_elem:
        player_1_points_scored.append(elem.find_next('div').find_next('div').find_next('div').get_text())

    points_scored_2_elem = soup.find_all('div', text='Points Scored')
    for elem in points_scored_2_elem:
        player_2_points_scored.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())

    fifty_plus_breaks_elem = soup.find_all('div', text='50+ Breaks')
    for elem in fifty_plus_breaks_elem:
        player_1_fifty_plus_breaks.append(elem.find_next('div').find_next('div').find_next('div').get_text())

    fifty_plus_breaks_2_elem = soup.find_all('div', text='50+ Breaks')
    for elem in fifty_plus_breaks_2_elem:
        player_2_fifty_plus_breaks.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())

    match_progress_elem = soup.find_all('div', text='Match progress')
    for elem in match_progress_elem:
        match_progress.append(elem.find_next('div').find_next('div').find_next('div').get_text(strip=True))

    points_per_frame_elem = soup.find_all('div', text='points/frame')
    for elem in points_per_frame_elem:  # was iterating points_scored_elem by mistake
        player_1_points_per_frame.append(elem.find_next('div').find_next('div').find_next('div').get_text())

    points_per_frame_2_elem = soup.find_all('div', text='Avg. points/frame')
    for elem in points_per_frame_2_elem:  # was iterating points_scored_elem by mistake
        player_2_points_per_frame.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())

    points_scored_2_elem = soup.find_all('div', text='Avg. points/frame')
    for elem in points_scored_2_elem:  # repeats the player-2 points/frame block above
        player_2_points_per_frame.append(elem.find_next('div').find_next('div').find_next('div').find_next('div').get_text())

    list_date_played = soup.find_all('div', text='Played on')
    for date in list_date_played:
        date_played.append(date.find_next('div').find_next('div').find_next('div').get_text(strip=True))
The above code is the bulk of my scraping code. Below is what I'm testing in order to skip the walkovers. I'm using the parent of the img because it includes the 'Walkover' string within its innerText.
for href in hrefs2:
    browser.get(href)
    wait_for_page_load()
    page_source = browser.page_source
    list_3_nationality = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for lis in list_1_nationality:
        check = lis.find_element_by_xpath("./parent::b")
        check = check.get_attribute('innerText')
        if 'Walkover' in check:
            continue
        else:
            player_1_nationality.append(lis.get_attribute('alt'))
For some reason this doesn't seem to be working.
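Note that the test loop above fetches list_3_nationality but then iterates over list_1_nationality, which still holds elements from the earlier scrape. A minimal sketch of the intended filter, assuming the parent <b> of the flag image contains the text 'Walkover' for walkover matches (variable names here are illustrative):

for href in hrefs2:
    browser.get(href)
    wait_for_page_load()
    flag_images = browser.find_elements_by_xpath('.//div/div[2]/div[1]/b/img')
    for img in flag_images:
        parent_text = img.find_element_by_xpath('./parent::b').get_attribute('innerText')
        if 'Walkover' in parent_text:
            continue  # skip walkovers so the stat lists stay aligned
        player_1_nationality.append(img.get_attribute('alt'))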

Related

Scraped data is not saving to csv file as it keeps returning a blank csv file

My scraper calls the website, hits each of the 44 pages and creates a CSV file, but the CSV file is empty. I return from each of the functions and save the data to a CSV at the end of the scraper.
Can anyone see what is wrong with my code?
Code:
import pandas, requests, bs4, time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime

TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-" + TODAY + ".csv"

driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)

URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
    'From': 'myemail'
}

def interceptor(request):
    del request.headers['From']
    request.headers['From'] = HEADERS["From"]

driver.request_interceptor = interceptor

def parse_job_post_div(div_html):
    soup = bs4.BeautifulSoup(div_html)
    job_ls = soup.findAll("div", {"class": "information"})
    job_data = []
    for job in job_ls:
        job_listing = job.find("div", {"class": "information"}).get_text(separator=", ").strip()
        title = job.find("span", {"role": "heading"}).get_text(separator=", ").strip()
        job_location = job.find("p", {"class": "job-info"}).get_text(separator=", ").strip()
        new_row = {"job_listing": job, "title": title, "job_location": job_location}
        job_data.append(new_row)
    return job_data

def get_data(wd):
    job_postings = driver.find_element(By.CLASS_NAME, "information")
    html = job_postings.get_attribute("innerHTML")
    parsed = parse_job_post_div(html)
    return pandas.DataFrame(parsed)

def process_page(url):
    driver.get(url)
    master_data = []
    i = 0
    while True:
        df = get_data(driver)
        master_data.append(df)
        if i == (MAX_PAGE - 1):
            break
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
        time.sleep(10)
        print(i)
        i += 1
    return pandas.concat(master_data, ignore_index=True)

data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
I have tried the above code.
The first problem I found in your code is that job_ls is an empty list, i.e. soup.findAll("div", {"class": "information"}) doesn't find anything.
Moreover, job_postings contains only one web element (the first job in the list) instead of all 10 jobs shown on the page, because you used .find_element instead of .find_elements. As a result of these and other problems, process_page(URL_BASE) returns an empty DataFrame.
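For illustration, the difference between the two calls (using the same class name as above):

# .find_element returns only the first matching element (and raises if there is none)
first_card = driver.find_element(By.CLASS_NAME, "information")

# .find_elements returns a list of every matching element (an empty list if there is none)
all_cards = driver.find_elements(By.CLASS_NAME, "information")
print(len(all_cards))  # e.g. 10 on a full results page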
In this case you can speed up the process and use less code by using Selenium directly instead of bs4:
driver.get(URL_BASE)
driver.implicitly_wait(30)
MAX_PAGE = 4
titles, locations, descriptions = [], [], []

for i in range(MAX_PAGE):
    print('current page:', i + 1, end='\r')
    titles += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > span[role=heading]')]
    locations += [loc.text.replace('\n', ', ') for loc in driver.find_elements(By.CSS_SELECTOR, '.information > p[class=job-info]')]
    descriptions += [desc.text for desc in driver.find_elements(By.CSS_SELECTOR, '.information > p[data-ph-at-id=jobdescription-text]')]
    if i < MAX_PAGE - 1:
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
    else:
        break

df = pandas.DataFrame({'title': titles, 'location': locations, 'description': descriptions})
df.to_csv(SAVE_FILENAME, index=False)
and df will be a DataFrame with one row per job posting and title, location and description columns.

Waiting for data table to load after click / Selenium

I am trying to read the data table from the Indian Central Pollution Control Board using Selenium/Python.
I am essentially following the approach presented here: https://github.com/RachitKamdar/Python-Scraper.
Thanks to @Prophet, I was able to read data from the first page (Select element using XPATH with Python?), but I cannot get Selenium to wait for the data table to reload when switching to page 2.
I tried to add a WebDriverWait instruction but this does not seem to work. Any help would be greatly appreciated. Thanks.
Here is what I tried to do:
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
i = 1
while i < maxpage + 1:
    browser.find_element(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
    # this works ok for page 1
    # this does not wait after the click for the data table to update. As a result res is wrong for page 2 [empty].
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    ...
    i = i + 1
Update 1:
Following Prophet's suggestion, I made the following modification:
browser.find_element_by_tag_name("select").send_keys("100")
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)
print(maxpage)
i = 1
while i < maxpage + 1:
    WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    if i == 1:
        data = getValsHtml(soup)
    else:
        data = data.append(getValsHtml(soup))
    print(i)
    print(data)
    i = i + 1
    browser.find_element(By.XPATH, '//a[@class="paginate_button next"]').click()
This still crashes on page 2 (data is empty). In addition, data should contain 100 items from page 1 but only contains 10. The maxpage number is correct (15).
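One way to confirm that the 100-rows-per-page selection has actually been applied before the table is read (a sketch, assuming the table keeps the id DataTables_Table_0 and shows 10 rows by default) is to wait until more than 10 rows are present:

WebDriverWait(browser, timeout).until(
    lambda d: len(d.find_elements(By.CSS_SELECTOR, "#DataTables_Table_0 tbody tr")) > 10)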
Update 2:
Here is the whole script after incorporating Prophet's recommendations [the original script follows https://github.com/RachitKamdar/Python-Scraper].
It only retrieves 10 points from the first page and fails to switch to the next page.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
import pandas as pd  # needed by getValsHtml

def getValsHtml(table):
    data = []
    heads = table.find_all('th')
    data.append([ele.text.strip() for ele in heads])
    rows = table.find_all('tr')
    for row in rows:
        cols = row.find_all('td')
        cols = [ele.text.strip() for ele in cols]
        data.append([ele for ele in cols])  # Get rid of empty values
    data.pop(1)
    data = pd.DataFrame(data[1:], columns=data[0])
    return data

def parameters(br, param):
    br.find_element_by_class_name("list-filter").find_element_by_tag_name("input").send_keys(param)
    br.find_elements_by_class_name("pure-checkbox")[1].click()
    br.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()

timeout = 60
url = 'https://app.cpcbccr.com/ccr/#/caaqm-dashboard-all/caaqm-landing/data'
chdriverpath = "/net/f1p/my_soft/chromedriver"
option = webdriver.ChromeOptions()
browser = webdriver.Chrome(executable_path="{}".format(chdriverpath), chrome_options=option)
browser.get(url)

station = "Secretariat, Amaravati - APPCB"
state = "Andhra Pradesh"
city = "Amaravati"
sd = ['01', 'Jan', '2018']
ed = ['31', 'Dec', '2021']
duration = "24 Hours"

WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.CLASS_NAME, "toggle")))
browser.find_elements_by_class_name("toggle")[0].click()
browser.find_element_by_tag_name("input").send_keys(state)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[1].click()
browser.find_element_by_tag_name("input").send_keys(city)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[2].click()
browser.find_element_by_tag_name("input").send_keys(station)
browser.find_element_by_class_name("options").click()
browser.find_elements_by_class_name("toggle")[4].click()
browser.find_element_by_class_name("filter").find_element_by_tag_name("input").send_keys(duration)
browser.find_element_by_class_name("options").click()
browser.find_element_by_class_name("c-btn").click()

for p in ['NH3']:
    print(p)
    try:
        parameters(browser, p)
    except:
        print("miss")
        browser.find_element_by_class_name("list-filter").find_element_by_tag_name("input").clear()
        pass

browser.find_element_by_class_name("wc-date-container").click()
browser.find_element_by_class_name("month-year").click()
browser.find_element_by_id("{}".format(sd[1].upper())).click()
browser.find_element_by_class_name("year-dropdown").click()
browser.find_element_by_id("{}".format(int(sd[2]))).click()
browser.find_element_by_xpath('//span[text()="{}"]'.format(int(sd[0]))).click()

browser.find_elements_by_class_name("wc-date-container")[1].click()
browser.find_elements_by_class_name("month-year")[1].click()
browser.find_elements_by_id("{}".format(ed[1].upper()))[1].click()
browser.find_elements_by_class_name("year-dropdown")[1].click()
browser.find_element_by_id("{}".format(int(ed[2]))).click()
browser.find_elements_by_xpath('//span[text()="{}"]'.format(int(ed[0])))[1].click()
browser.find_elements_by_tag_name("button")[-1].click()

next_page_btn_xpath = '//a[@class="paginate_button next"]'
actions = ActionChains(browser)

# This is how you should treat the Select drop down
select = Select(browser.find_element_by_tag_name("select"))
select.select_by_value('100')

WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="dataTables_wrapper no-footer"]')))
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)

i = 1
while i < maxpage + 1:
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    if i == 1:
        data = getValsHtml(soup)
    else:
        data = data.append(getValsHtml(soup))
    print(i)
    print(data)
    i = i + 1
    # scroll to the next page btn and then click it
    next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
    actions.move_to_element(next_page_btn).perform()
    browser.find_element(By.XPATH, next_page_btn).click()

browser.quit()
Instead of
browser.find_element(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a[contains(text(),'{}')]".format(i)).click()
try clicking on this element:
browser.find_element(By.XPATH, '//a[@class="paginate_button next"]').click()
It's simply the next-page button and it will not change depending on which page you are on.
Also, instead of
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.ID, "DataTables_Table_0_wrapper")))
try this:
WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="dataTables_wrapper no-footer"]')))
This element will be the same for all the pages, while the one you are trying to use is defined for the first page only.
UPD
The correct code should be like this:
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select

next_page_btn_xpath = '//a[@class="paginate_button next"]'
actions = ActionChains(browser)

# This is how you should treat the Select drop down
select = Select(browser.find_element_by_tag_name("select"))
select.select_by_value('100')

WebDriverWait(browser, timeout).until(EC.visibility_of_element_located((By.XPATH, '//div[@class="dataTables_wrapper no-footer"]')))
maxpage = int(browser.find_elements(By.XPATH, "//*[@id='DataTables_Table_0_paginate']/span/a")[-1].text)

i = 1
while i < maxpage + 1:
    res = browser.page_source
    soup = BeautifulSoup(res, 'html.parser')
    soup = soup.find(id='DataTables_Table_0')
    if i == 1:
        data = getValsHtml(soup)
    else:
        data = data.append(getValsHtml(soup))
    print(i)
    print(data)
    i = i + 1
    # scroll to the next page btn and then click it
    next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
    actions.move_to_element(next_page_btn).perform()
    next_page_btn.click()
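A further hardening step (a sketch, not part of the answer above, assuming DataTables replaces the row elements when the page changes): keep a reference to a row of the current page before clicking next and wait for it to go stale, so page_source is only read after the table has been redrawn.

# remember a row of the current page before clicking next
old_first_row = browser.find_element(By.CSS_SELECTOR, '#DataTables_Table_0 tbody tr')
next_page_btn = browser.find_element_by_xpath(next_page_btn_xpath)
actions.move_to_element(next_page_btn).perform()
next_page_btn.click()
# block until the old row is detached from the DOM, i.e. the table has been redrawn
WebDriverWait(browser, timeout).until(EC.staleness_of(old_first_row))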

Page loop not working on python webscrape

I'm quite new to Python and have written a script using BeautifulSoup to parse a website table. I've tried everything but can't get the loop to cycle through pages. It currently just repeats the data from the first page 8 times (the number of pages).
Can anyone please help?
Code:
import requests
from bs4 import BeautifulSoup

first_year = 2020
last_year = 2020

for i in range(last_year - first_year + 1):
    year = str(first_year + i)
    print("Running for year:", year)
    text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year).text
    soup = BeautifulSoup(text, "html.parser")
    options = soup.findAll("option")
    opts = []
    for option in options:
        if not option['value'].startswith("20") and not option['value'].startswith("19") and option["value"]:
            opts.append({option["value"]: option.contents[0]})
    for opt in opts:
        for key, value in opt.items():
            print("Doing option:", value)
            text = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID=" + year + "&Round=" + key).text
            pages_soup = BeautifulSoup(text, "html.parser")
            p = pages_soup.findAll("a")
            pages = 8
            if "&Page=" in str(p[-2]):
                pages = int(p[-2].contents[0])
            for j in range(pages):
                print("Page {}/{}".format(str(j+1), str(pages)))
                parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
                p_soup = BeautifulSoup(text, "html.parser")
                tbody = pages_soup.findAll("tbody")
                tbody_soup = BeautifulSoup(str(tbody), "html.parser")
                tr = tbody_soup.findAll("tr")
                for t in tr:
                    t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
                    t = t[4:len(t)-5].split('</td><td>')
                    t.append(str(j+1))
                    t.append(str(value))
                    t.append(str(year))
                    open("output.csv", "a").write("\n" + ";".join(t))
Thank you.
Try this — inside your page loop you parse text (the round page fetched before the loop) instead of parse, and read the rows from pages_soup instead of p_soup, so every iteration re-processes the same first page:
for j in range(pages):
    print("Page {}/{}".format(str(j+1), str(pages)))
    parse = requests.get("https://finalsiren.com/AFLPlayerStats.asp?SeasonID={}&Round={}&Page={}".format(year, key, j+1)).text
    p_soup = BeautifulSoup(parse, "html.parser")
    tbody = p_soup.findAll("tbody")
    tbody_soup = BeautifulSoup(str(tbody), "html.parser")
    tr = tbody_soup.findAll("tr")
    for t in tr:
        t = str(t).replace("</tr>", "").replace("<tr>", "").replace("amp;", "")
        t = t[4:len(t)-5].split('</td><td>')
        t.append(str(j+1))
        t.append(str(value))
        t.append(str(year))
        open("output.csv", "a").write("\n" + ";".join(t))
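As a side note (a sketch, not required for the fix, reusing the loop variables above): opening output.csv once with a context manager and the csv module avoids reopening the file for every row, and pulling the cell text directly from the parsed rows is roughly equivalent to the string splitting above.

import csv

with open("output.csv", "a", newline="") as f:
    writer = csv.writer(f, delimiter=";")
    for row in tbody_soup.findAll("tr"):
        cells = [td.get_text(strip=True) for td in row.findAll("td")]
        writer.writerow(cells + [str(j + 1), str(value), str(year)])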

How do I find a div element by searching in Selenium, and then copy an attribute from that div using Selenium and Python?

What I am trying to do is get the asin (an attribute) from a div (element) in the HTML, then concatenate amazon.com/dp/ + asin to form a URL which is then visited. The divs have no id but are identified by the data-index="1" attribute, so I am wondering how to select this div element and then read the asin attribute from it. Thanks for reading.
Using Python 3.7 and Selenium WebDriver.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
email = ('.')
password = ('.')
query = ('macbook')
urls = []
prices = []
names = []
descs = []

def search_amazon(query):
    driver.get('https://amazon.com/')
    searchBox = driver.find_element_by_id('twotabsearchtextbox')
    time.sleep(2)
    searchBox.send_keys(query)
    searchBox.send_keys(Keys.ENTER)
    time.sleep(3)
    firstResult = driver.find_element_by_name('data-index="1"')
    asin = firstResult.getAttribute('data-asin')
    print(asin)
    url = 'https://amazon.com/dp/' + asin
    driver.get(url)
    print(url)
    return url

search_amazon(query)
You need to change these two lines of code to the code I have provided:
firstResult = driver.find_element_by_name('data-index="1"')
asin = firstResult.getAttribute('data-asin')
data-index is not the name, it is an attribute, so you can use the following CSS selector instead:
firstResult = driver.find_element_by_css_selector('div[data-index="1"]>div')
asin = firstResult.get_attribute('data-asin')
Here is the working code.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()
email = ('.')
password = ('.')
query = ('macbook')
urls = []
prices = []
names = []
descs = []

def search_amazon(query):
    driver.get('https://amazon.com/')
    searchBox = driver.find_element_by_id('twotabsearchtextbox')
    time.sleep(2)
    searchBox.send_keys(query)
    searchBox.send_keys(Keys.ENTER)
    time.sleep(3)
    firstResult = driver.find_element_by_css_selector('div[data-index="1"]>div')
    asin = firstResult.get_attribute('data-asin')
    print(asin)
    url = 'https://amazon.com/dp/' + asin
    driver.get(url)
    print(url)
    return url

search_amazon(query)
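A possible refinement (a sketch, not part of the answer above): replace the fixed time.sleep calls with explicit waits for the search box and the first result, which tends to be faster and more reliable:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
searchBox = wait.until(EC.element_to_be_clickable((By.ID, 'twotabsearchtextbox')))
searchBox.send_keys(query)
searchBox.send_keys(Keys.ENTER)
firstResult = wait.until(
    EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-index="1"]>div')))
asin = firstResult.get_attribute('data-asin')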

How can I scroll a particular section of a dynamic web page using selenium webdriver in python?

I have found many references that scroll the entire web page, but I am looking to scroll only a particular section. I am working on marketwatch.com, specifically the Latest News tab. How can I scroll just this latest news tab using Selenium WebDriver?
Below is my code, which returns the headings of the news but keeps repeating the same headings.
from bs4 import BeautifulSoup
import urllib
import csv
import time
from selenium import webdriver

count = 0
browser = webdriver.Chrome()
browser.get("https://www.marketwatch.com/newsviewer")
pageSource = browser.page_source
soup = BeautifulSoup(pageSource, 'lxml')
arkodiv = soup.find("ol", class_="viewport")

while browser.find_element_by_tag_name('ol'):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(0.5)
    div = list(arkodiv.find_all('div', class_="nv-details"))
    heading = []
    Data_11 = list(soup.find_all("div", class_="nv-text-cont"))
    datetime = list(arkodiv.find_all("li", timestamp=True))
    for sa in datetime:
        sh = sa.find("div", class_="nv-text-cont")
        if sh.find("a", class_=True):
            di = sh.text.strip()
            di = di.encode('ascii', 'ignore').decode('ascii')
        else:
            continue
        print(di)
        heading.append(di)
        count = count + 1
    if 'End of Results' in arkodiv:
        print('end')
        break
    else:
        continue

print(count)
That happens because the script you are executing scrolls to the bottom of the page.
To keep scrolling inside the element that holds the news, you need to replace this:
browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
with this:
browser.execute_script("document.documentElement.getElementsByClassName('viewport')[0].scrollTop = 999999")
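An equivalent variant (a sketch, not from the answer above) is to locate the viewport element with Selenium and pass it into execute_script, which avoids hard-coding a large scrollTop value:

viewport = browser.find_element_by_class_name('viewport')
browser.execute_script("arguments[0].scrollTop = arguments[0].scrollHeight;", viewport)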
EDIT
This is the complete working solution:
from bs4 import BeautifulSoup
import urllib
import csv
import time
from selenium import webdriver

count = 0
browser = webdriver.Chrome()
browser.get("https://www.marketwatch.com/newsviewer")

while browser.find_element_by_tag_name('ol'):
    pageSource = browser.page_source
    soup = BeautifulSoup(pageSource, 'lxml')
    arkodiv = soup.find("ol", class_="viewport")
    browser.execute_script("document.documentElement.getElementsByClassName('viewport')[0].scrollTop = 999999")
    time.sleep(0.5)
    div = list(arkodiv.find_all('div', class_="nv-details"))
    heading = set()
    Data_11 = list(soup.find_all("div", class_="nv-text-cont"))
    datetime = list(arkodiv.find_all("li", timestamp=True))
    for sa in datetime:
        sh = sa.find("div", class_="nv-text-cont")
        if sh.find("a", class_=True):
            di = sh.text.strip()
            di = di.encode('ascii', 'ignore').decode('ascii')
        else:
            continue
        print(di)
        heading.add(di)
        count = count + 1
    if 'End of Results' in arkodiv:
        print('end')
        break
    else:
        continue

print(count)
EDIT 2
You may also want to change how you store the headings, since the way you currently do it keeps duplicates in the list. I changed it to a set so that doesn't happen.
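A two-line illustration of the difference (values are made up):

headings_list = ["Fed raises rates", "Fed raises rates"]  # a list keeps both copies
headings_set = set(headings_list)                         # a set collapses duplicates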
