Scrape dynamic data from a table with Python, BeautifulSoup, Selenium

I would like to scrape all the URL links associated with the soccer games listed in the table on this website.
Here is the code:
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Firefox()
url = 'https://www.coteur.com/cotes-foot.php'
driver.get(url)
fixture1 = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div/div[2]/div/table/tbody/tr[3]/td[3]/a")
print(fixture1.text)
links = []
i = 3
while i <= 6:
    fixture = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div/div[2]/div/table/tbody/tr[" + str(i) + "]/td[3]/a")
    links.append(fixture)
    i = i + 3
print(links)
driver.close()
When I scrape one match it returns the data I'm expecting. However, when I try to loop over all the soccer games I run into a problem.
Here is the result of the code:
Betis Seville - Granada 74 Cf
[<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="0199958a-4d31-4a21-9856-8f8c3cc8ee05", element="158fcdaf-501f-41a4-9550-8a42543acc22")>, <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="0199958a-4d31-4a21-9856-8f8c3cc8ee05", element="74e67896-fccb-48da-8eef-bbf8d9a6f3b3")>]
I wanted the same kind of output as for the first element, but instead of the text I get WebElement objects, which is not what I was expecting.

This works well
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("https://www.coteur.com/cotes-foot.php")
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
data = [l.text for l in links]
print(data)

I tried your code; here is the result:
File "./coteur2.py", line 17
data = [l.text for l in links]
^
IndentationError: unexpected indent
I prefer to do it this way:
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
n = 0
while n < len(links):
    links[n] = links[n].text
    n = n + 1
print(links)
Thanks for your help
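For reference, since the original goal was the URLs of the matches rather than their link text, here is a minimal sketch along the same lines. It reuses the locator from the answer above; pulling the href attribute instead of the text is an assumption about what was wanted, not part of the thread:

from selenium import webdriver

driver = webdriver.Firefox()
driver.get("https://www.coteur.com/cotes-foot.php")

# Same locator as in the answer; get_attribute('href') returns the link target
# instead of the visible anchor text.
anchors = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
urls = [a.get_attribute('href') for a in anchors]
print(urls)

driver.close()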

Related

selenium Instagram scraper duplication

I am trying to scrape Instagram by hashtag (in this case "dog") using Selenium:
scroll to load images
get links of posts for loaded images
However, I realized that most of the links are repeated (last 3 lines). I don't know what the problem is. I even tried several libraries for Instagram scraping, but all of them either give errors or can't search by hashtag.
I am scraping Instagram to get image data for my deep learning classifier model. I would also like to know if there are better methods for Instagram scraping.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains as AC

driver = webdriver.Edge("msedgedriver.exe")
driver.get("https://www.instagram.com")
tag = "dog"
numberOfScrolls = 70

### Login Section ###
time.sleep(3)
username_field = driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[1]/div/label/input')
username_field.send_keys("myusername")
password_field = driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[2]/div/label/input')
password_field.send_keys("mypassword")
time.sleep(1)
driver.find_element_by_xpath('//*[@id="loginForm"]/div/div[3]').click()
time.sleep(5)

### Scraping Section ###
link = "https://www.instagram.com/explore/tags/" + tag
driver.get(link)
time.sleep(5)

Links = []
for i in range(numberOfScrolls):
    AC(driver).send_keys(Keys.END).perform()  # scrolls to the bottom of the page
    time.sleep(1)
    for x in range(1, 8):
        try:
            row = driver.find_element_by_xpath(
                '//*[@id="react-root"]/section/main/article/div[2]/div/div[' + str(i) + ']')
            row = row.find_elements_by_tag_name("a")
            for element in row:
                if element.get_attribute("href") is not None:
                    print(element.get_attribute("href"))
                    Links.append(element.get_attribute("href"))
        except:
            continue

print(len(Links))
Links = list(set(Links))
print(len(Links))
I found what my mistake was:
row = driver.find_element_by_xpath('//*[@id="react-root"]/section/main/article/div[2]/div/div[' + str(i) + ']')
Specifically the str(i) part: it should be x instead of i. That's why most of the links were repeated.
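For clarity, here is a minimal corrected sketch of the scraping loop (the setup, imports and the Links list are unchanged from the question; the only change is indexing the row by x instead of i, as described above):

for i in range(numberOfScrolls):
    AC(driver).send_keys(Keys.END).perform()  # scroll to load more posts
    time.sleep(1)
    for x in range(1, 8):
        try:
            # Index the row by x (the inner counter), not i (the scroll counter).
            row = driver.find_element_by_xpath(
                '//*[@id="react-root"]/section/main/article/div[2]/div/div[' + str(x) + ']')
            for element in row.find_elements_by_tag_name("a"):
                href = element.get_attribute("href")
                if href is not None:
                    Links.append(href)
        except:
            continue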

How can I scrape all the links in a vector of webpages?

I need to scrape the src attribute of every iframe containing 'fibalivestats.com' found inside the game links.
The initial URL is the fixture page, which stores all the links to the games.
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import chromedriver_binary
from bs4 import BeautifulSoup
options = webdriver.ChromeOptions()
options.add_argument('headless')
browser = webdriver.Chrome(options=options)
browser.get('https://www.argentina.basketball/tfb/fixture/de/division-buenos-aires')
partidos = browser.find_elements_by_tag_name('a')
for partido in partidos:
    if partido.get_attribute('href').find('https://www.argentina.basketball/tfb/partido/') == 0:
        hrefs.append(partido.get_attribute("href"))
Then I extract the iframes with a 'for' loop over the hrefs that interest me, but sometimes it doesn't pick up all the links (this happens randomly):
hrefs=[]
fibalive=[]
errores=[]
j = 0
a = 0
for href in hrefs:
    browser.get(href)
    try:
        if browser.find_element_by_tag_name('iframe').get_attribute('src').find('https://www.fibalivestats.com/') == 0:
            fibalive.append(browser.find_element_by_tag_name('iframe').get_attribute('src'))
    except NoSuchElementException as exception:
        j = j + 1
        file = open('D:/Desktop/Partidos/partidos' + str(j) + '.txt', "w")
        file.write(browser.page_source)
        file.close()
When generating a .txt from the page_source of the links that raise the exception, the HTML is as follows:
<html><head></head><body></body></html>
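One possible explanation (an assumption, not something confirmed in the question) is that the iframe is injected by JavaScript after the initial load, so an immediate find_element call can race the render. A minimal sketch of the same loop with an explicit wait, reusing the variables and locators from the question:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

for href in hrefs:
    browser.get(href)
    try:
        # Wait up to 15 seconds for an iframe to be attached to the DOM.
        iframe = WebDriverWait(browser, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, 'iframe')))
        src = iframe.get_attribute('src')
        if src.startswith('https://www.fibalivestats.com/'):
            fibalive.append(src)
    except TimeoutException:
        errores.append(href)  # keep track of pages where no iframe appeared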

Building data for POST method on .aspx website in Python

I'm new to .NET and Python, but I would like to make a program to scrape an .aspx site and work with the content there (the HTML code is enough). I tried some libraries in Python, but all I got was the first page of the site. It seems like I am building the wrong POST data; I don't know the right form of the data, what should be included and what not.
http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018
import requests, urllib, urllib2
r = requests.get("http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018")
content = r.text
print content
start_index = content.find('id="__VIEWSTATE"') + 24
sliced_vs = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__VIEWSTATEGENERATOR"') + 33
sliced_vsg = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__VIEWSTATEENCRYPTED"') + 33
sliced_vse = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__EVENTVALIDATION"') + 30
sliced_EV = content[start_index:content.find('"',start_index)]
form_data = {'__EVENTTARGET': 'gvZverejnenie',
             '__EVENTARGUMENT': 'Page$2',
             '__VIEWSTATE': sliced_vs,
             '__VIEWSTATEGENERATOR': sliced_vsg,
             '__VIEWSTATEENCRYPTED': sliced_vse,
             '__EVENTVALIDATION': sliced_EV}
data_encoded = urllib.urlencode(form_data)
r = requests.post('http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018',data=data_encoded)
content = r.text
print content
For example, in the code I want to get the second page ('Page$2'). I always get the same result, but with different values for ViewState and EventValidation. Where could the problem be?
This code requires Selenium and chromedriver to control Google Chrome. It turns out there are 476 pages in total (following the URL you provided):
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018')
with open('page_1.html', 'w') as f:
    f.write(driver.page_source)

page_num = 2
while True:
    try:
        element = driver.find_element_by_link_text(str(page_num))
    except NoSuchElementException:
        elements = driver.find_elements_by_link_text('...')
        if len(elements) == 0:
            break  # less than 11 pages total
        elif len(elements) == 1 and page_num > 12:
            break  # last page
        element = elements[-1]
    element.click()
    with open('page_{}.html'.format(page_num), 'w') as f:
        f.write(driver.page_source)
    page_num += 1
driver.quit()
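As a side note, the requests-only approach from the question can also work in principle: ASP.NET postbacks generally need a persistent session and every hidden form field, not just the four values sliced out by hand. The following Python 3 sketch is a hedged alternative, not the answer above; the __EVENTTARGET and __EVENTARGUMENT values come from the question, everything else is an assumption about the page's form:

import requests
from bs4 import BeautifulSoup

URL = 'http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018'

with requests.Session() as session:
    soup = BeautifulSoup(session.get(URL).text, 'html.parser')

    # Collect every hidden input (__VIEWSTATE, __EVENTVALIDATION, ...) verbatim.
    form_data = {inp.get('name'): inp.get('value', '')
                 for inp in soup.find_all('input', type='hidden')
                 if inp.get('name')}

    # Ask the grid control for its second page, as in the question.
    form_data['__EVENTTARGET'] = 'gvZverejnenie'
    form_data['__EVENTARGUMENT'] = 'Page$2'

    page2 = session.post(URL, data=form_data)
    print(page2.text[:500])  # inspect the start of the returned HTML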

Python Scraping from website

I've tried to write a web scraper for https://www.waug.com/area/?idx=15:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup
url = requests.get('https://www.abcd.com/area/?abc=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')
count = 1
names = soup.select('#good_{} > div > div.class_name > div > div'.format(count))
prices = soup.select('#good_{} > div > div.class_name > div.class_name'.format(count))
for name in names:
    while count < 45:
        print(name.text)
        count = count + 1

for price in prices:
    while count < 45:
        print(price.text)
        count = count + 1
The output is only the first item name repeated 45 times, and no prices. How can I get all the item names and prices? I want each item's name and price on the same line. (I've changed the URL and some of the class names just in case.)
In order to be sure to match each title with the right price, I'd get the whole "item-good" class.
Then a for loop lets me be sure that the title I am getting matches its price.
Here's an example of how to parse a website with BeautifulSoup:
#!/usr/bin/env python3
#_*_coding:utf8_*_
import requests
from bs4 import BeautifulSoup
url = requests.get('https://www.waug.com/area/?idx=15')
html = url.text
soup = BeautifulSoup(html, 'html.parser')
count = 1
items = soup.findAll("div", {"class": "item-good"})
for item in items:
    item_title = item.find("div", {"class": "good-title-text"})
    item_price = item.find("div", {"class": "price-selling"})
    print(item_title.text + " " + item_price.text)
    # If you get encoding errors, delete the row above and uncomment the one below
    #print(item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8"))
As per the OP's request, this is not enough because there is a "more" button to press on the webpage in order to retrieve all the results.
This can be done using Selenium WebDriver.
=== IMPORTANT NOTE ===
In order to make this work you'll also need to copy the "chromedriver" file into your script folder.
You can download it from Google's ChromeDriver site.
Here's the script:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')
for number in range(10):
    try:
        WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
        more_button = browser.find_element_by_id('more_good')
        more_button.click()
        time.sleep(10)
    except:
        print("Scrolling is now complete!")

source = browser.page_source
# This source variable should be used as input for BeautifulSoup
print(source)
Now it is time to merge the two solutions explained above in order to get the final requested result.
Please keep in mind that this is just a quick'n'dirty hack that needs proper error handling and polishing, but it should be enough to get you started:
#!/usr/bin/env python3
#_*_coding:utf8_*_
from bs4 import BeautifulSoup
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
browser = webdriver.Chrome()
browser.get('https://www.waug.com/area/?idx=15')
def is_page_load_complete():
    close_button = browser.find_element_by_id('close_good')
    return close_button.is_displayed()

while True:
    WebDriverWait(browser, 60).until(EC.presence_of_element_located((By.ID, "more_good")))
    time.sleep(10)
    more_button = browser.find_element_by_id('more_good')
    if more_button.is_displayed():
        more_button.click()
    else:
        if is_page_load_complete():
            break

source = browser.page_source
soup = BeautifulSoup(source, 'html.parser')

items = soup.findAll("div", {"class": "item-good"})
for item in items:
    item_title = item.find("div", {"class": "good-title-text"})
    item_price = item.find("div", {"class": "price-selling"})
    print(item_title.text + " " + item_price.text)
    # If you get encoding errors, comment the row above and uncomment the one below
    #print(item_title.text.encode("utf-8") + " " + item_price.text.encode("utf-8"))

print("Total items found: " + str(len(items)))

StaleElementReferenceException selenium webdriver python

I'm writing a crawler using Selenium, Python and PhantomJS to use Google's reverse image search. So far I've successfully been able to upload an image and crawl the search results on the first page. However, when I try to click through the search results navigation, I get a StaleElementReferenceException. I have read about it in many posts, but I still could not implement a solution. Here is the code that breaks:
ele7 = browser.find_element_by_id("nav")
ele5 = ele7.find_elements_by_class_name("fl")
count = 0
for elem in ele5:
    if count <= 2:
        print str(elem.get_attribute("href"))
        elem.click()
        browser.implicitly_wait(20)
        ele6 = browser.find_elements_by_class_name("rc")
        for result in ele6:
            f = result.find_elements_by_class_name("r")
            for line in f:
                link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
                links.append(link)
                parsed_uri = urlparse(link)
                domains.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))
    count += 1
The code breaks at print str(elem.get_attribute("href")). How can I solve this?
Thanks in advance.
Clicking a link will cause the browser to go to another page, making the references to elements on the old page (ele5, elem) invalid.
Modify the code so it does not reference invalid elements.
For example, you can get the URLs before you visit the other pages:
ele7 = browser.find_element_by_id("nav")
ele5 = ele7.find_elements_by_class_name("fl")
urls = [elem.get_attribute('href') for elem in ele5] # <-----
browser.implicitly_wait(20)
for url in urls[:2]:  # <------
    print url
    browser.get(url)  # <------ used `browser.get` instead of `click`;
                      #         using `element.click` will cause the error.
    ele6 = browser.find_elements_by_class_name("rc")
    for result in ele6:
        f = result.find_elements_by_class_name("r")
        for line in f:
            link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
            links.append(link)
            parsed_uri = urlparse(link)
            domains.append('{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri))
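A hedged variation on the same idea: instead of the blanket implicitly_wait, an explicit wait for the result blocks on each page can make the loop more robust. The selectors are the ones from the question; the wait itself is an assumption and not part of the original answer:

from urlparse import urlparse  # Python 2, as in the question

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Assumes `browser` already has the first results page loaded, as in the question.
links, domains = [], []

nav = browser.find_element_by_id("nav")
urls = [a.get_attribute("href") for a in nav.find_elements_by_class_name("fl")]

for url in urls[:2]:
    browser.get(url)
    # Wait until the result blocks are present before reading them.
    WebDriverWait(browser, 20).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "rc")))
    for result in browser.find_elements_by_class_name("rc"):
        for line in result.find_elements_by_class_name("r"):
            link = line.find_elements_by_tag_name("a")[0].get_attribute("href")
            links.append(link)
            domains.append('{0.scheme}://{0.netloc}/'.format(urlparse(link)))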
