I'm new to .NET and Python, but I would like to write a program that scrapes an .aspx site and works with the content there (the HTML code is enough). I tried some libraries in Python, but all I got was the first page of the site. It seems I am building the POST data incorrectly; I don't know the right form of the data, what should be included and what not.
http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018
import requests, urllib, urllib2
r = requests.get("http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018")
content = r.text
print content
start_index = content.find('id="__VIEWSTATE"') + 24
sliced_vs = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__VIEWSTATEGENERATOR"') + 33
sliced_vsg = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__VIEWSTATEENCRYPTED"') + 33
sliced_vse = content[start_index:content.find('"',start_index)]
start_index = content.find('id="__EVENTVALIDATION"') + 30
sliced_EV = content[start_index:content.find('"',start_index)]
form_data = {'__EVENTTARGET': 'gvZverejnenie',
             '__EVENTARGUMENT': 'Page$2',
             '__VIEWSTATE': sliced_vs,
             '__VIEWSTATEGENERATOR': sliced_vsg,
             '__VIEWSTATEENCRYPTED': sliced_vse,
             '__EVENTVALIDATION': sliced_EV}
data_encoded = urllib.urlencode(form_data)
r = requests.post('http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018',data=data_encoded)
content = r.text
print content
For example, in the code above I want to get the second page ('Page$2'). I always get the same result back, only with different values for ViewState and EventValidation. Where could the problem be, please?
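For reference, here is roughly what I think the postback should look like using a requests.Session and BeautifulSoup instead of string slicing (an untested sketch; it assumes the pager really posts back with __EVENTTARGET=gvZverejnenie and __EVENTARGUMENT=Page$N, as in my attempt above):
import requests
from bs4 import BeautifulSoup

URL = "http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018"

def hidden_fields(html):
    # Collect every <input type="hidden"> (VIEWSTATE, EVENTVALIDATION, ...) into a dict
    soup = BeautifulSoup(html, "html.parser")
    return {i["name"]: i.get("value", "")
            for i in soup.find_all("input", type="hidden")
            if i.has_attr("name")}

with requests.Session() as s:
    fields = hidden_fields(s.get(URL).text)
    fields["__EVENTTARGET"] = "gvZverejnenie"   # assumed pager control id
    fields["__EVENTARGUMENT"] = "Page$2"        # requested page
    r = s.post(URL, data=fields)                # pass the dict directly, not a pre-encoded string
    print(r.text[:500])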
This code requires Selenium and chromedriver to control Google Chrome. It turns out there are 476 pages in total (following the URL you provided).
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options)
driver.get('http://nastenka.lesy.sk/EZOZV/Publish/ObjednavkyZverejnenie.aspx?YR=2018')
with open('page_1.html', 'w') as f:
    f.write(driver.page_source)

page_num = 2
while True:
    try:
        element = driver.find_element_by_link_text(str(page_num))
    except NoSuchElementException:
        elements = driver.find_elements_by_link_text('...')
        if len(elements) == 0:
            break  # less than 11 pages total
        elif len(elements) == 1 and page_num > 12:
            break  # last page
        element = elements[-1]
    element.click()
    with open('page_{}.html'.format(page_num), 'w') as f:
        f.write(driver.page_source)
    page_num += 1

driver.quit()
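Once the page_N.html files are on disk you can parse them offline however you like; a minimal sketch with BeautifulSoup (it assumes the file names written by the loop above):
import glob
from bs4 import BeautifulSoup

# Walk over every saved page and pull out the table rows for further processing.
# Note: sorted() here gives lexicographic order (page_1, page_10, ...), not numeric.
for path in sorted(glob.glob('page_*.html')):
    with open(path) as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    rows = soup.find_all('tr')
    print(path, len(rows), 'rows')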
I am a student working on a scraping project, and I am having trouble completing my script because it fills my computer's memory with all of the data it stores.
It currently stores all of my data until the end, so my solution to this would be to break up the scrape into smaller bits and then write out the data periodically so it does not just continue to make one big list and then write out at the end.
In order to do this, I would need to stop my scroll method, scrape the loaded profiles, write out the data that I have collected, and then repeat this process without duplicating my data. It would be appreciated if someone could show me how to do this. Thank you for your help :)
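In pseudocode, the batching I have in mind looks roughly like this (BATCH_SIZE and scrape_profile are made-up placeholders, just to illustrate the periodic write-out and de-duplication):
import csv

BATCH_SIZE = 50          # how many profiles to hold in memory before flushing (made-up value)
seen = set()             # hrefs already scraped, to avoid duplicates across batches
batch = []

def flush(rows):
    # Append the current batch to the spreadsheet and clear it from memory.
    with open("Spredsheet.txt", "a", newline="") as f:
        csv.writer(f).writerows(rows)
    rows.clear()

def scrape_profile(href):
    # placeholder for the per-profile scraping I already do
    return [href]

for href in ["example1", "example2"]:   # stand-in for the links collected after each scroll
    if href in seen:
        continue
    seen.add(href)
    batch.append(scrape_profile(href))
    if len(batch) >= BATCH_SIZE:
        flush(batch)

flush(batch)             # write out whatever is left at the end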
Here's my current code:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from selenium.common.exceptions import NoSuchElementException
Data = []

driver = webdriver.Chrome()
driver.get("https://directory.bcsp.org/")
count = int(input("Number of Pages to Scrape: "))
body = driver.find_element_by_xpath("//body")
profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

while len(profile_count) < count:  # Get links up to "count"
    body.send_keys(Keys.END)
    sleep(1)
    profile_count = driver.find_elements_by_xpath("//div[@align='right']/a")

for link in profile_count:  # Calling up links
    temp = link.get_attribute('href')  # temp for
    driver.execute_script("window.open('');")  # open new tab
    driver.switch_to.window(driver.window_handles[1])  # focus new tab
    driver.get(temp)

    # scrape code
    Name = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[1]/div[2]/div').text
    IssuedBy = "Board of Certified Safety Professionals"
    CertificationorDesignaationNumber = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[1]/td[3]/div[2]').text
    CertfiedorDesignatedSince = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[3]/td[1]/div[2]').text
    try:
        AccreditedBy = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[3]/div[2]/a').text
    except NoSuchElementException:
        AccreditedBy = "N/A"
    try:
        Expires = driver.find_element_by_xpath('/html/body/table/tbody/tr/td/table/tbody/tr/td[5]/div/table[1]/tbody/tr/td[3]/table/tbody/tr[5]/td[1]/div[2]').text
    except NoSuchElementException:
        Expires = "N/A"

    info = Name, IssuedBy, CertificationorDesignaationNumber, CertfiedorDesignatedSince, AccreditedBy, Expires + "\n"
    Data.extend(info)
    driver.close()
    driver.switch_to.window(driver.window_handles[0])

with open("Spredsheet.txt", "w") as output:
    output.write(','.join(Data))

driver.close()
Try the approach below using requests and BeautifulSoup. In the script I have used the search API URL fetched from the website itself.
On the first iteration it builds the URL without paging parameters and writes the headers plus the first batch of data into the .csv file.
On every following iteration it builds the URL with two extra parameters, start_on_page and show_per_page: start_on_page begins at 20 and is incremented by 20 on each iteration, and show_per_page is set to 100 so that 100 records are pulled per request, and so on until all the data has been dumped into the .csv file.
The script dumps four things: number, name, location and profile URL.
On each iteration the data is appended to the .csv file, so your memory issue is resolved by this approach.
Do not forget to set the file_path variable to the directory where you want the .csv file created before running the script.
import requests
from urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from bs4 import BeautifulSoup as bs
import csv
def scrap_directory_data():
    list_of_credentials = []
    file_path = ''
    file_name = 'credential_list.csv'
    count = 0
    page_number = 0
    page_size = 100
    create_url = ''
    main_url = 'https://directory.bcsp.org/search_results.php?'
    first_iteration_url = 'first_name=&last_name=&city=&state=&country=&certification=&unauthorized=0&retired=0&specialties=&industries='
    number_of_records = 0
    csv_headers = ['#', 'Name', 'Location', 'Profile URL']

    while True:
        if count == 0:
            create_url = main_url + first_iteration_url
            print('-' * 100)
            print('1 iteration URL created: ' + create_url)
            print('-' * 100)
        else:
            create_url = main_url + 'start_on_page=' + str(page_number) + '&show_per_page=' + str(page_size) + '&' + first_iteration_url
            print('-' * 100)
            print('Other then first iteration URL created: ' + create_url)
            print('-' * 100)

        page = requests.get(create_url, verify=False)
        extracted_text = bs(page.text, 'lxml')
        result = extracted_text.find_all('tr')

        if len(result) > 0:
            for idx, data in enumerate(result):
                if idx > 0:
                    number_of_records += 1
                    name = data.contents[1].text
                    location = data.contents[3].text
                    profile_url = data.contents[5].contents[0].attrs['href']
                    list_of_credentials.append({
                        '#': number_of_records,
                        'Name': name,
                        'Location': location,
                        'Profile URL': profile_url
                    })
                print(data)
                with open(file_path + file_name, 'a+') as cred_CSV:
                    csvwriter = csv.DictWriter(cred_CSV, delimiter=',', lineterminator='\n', fieldnames=csv_headers)
                    if idx == 0 and count == 0:
                        print('Writing CSV header now...')
                        csvwriter.writeheader()
                    else:
                        for item in list_of_credentials:
                            print('Writing data rows now..')
                            print(item)
                            csvwriter.writerow(item)
                        list_of_credentials = []
        count += 1
        page_number += 20

scrap_directory_data()
I would like to scrape all the URLs associated with the soccer games listed in the table on this website.
Here is the code:
from selenium import webdriver
from bs4 import BeautifulSoup
driver = webdriver.Firefox()
url = 'https://www.coteur.com/cotes-foot.php'
driver.get(url)
fixture1 = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div/div[2]/div/table/tbody/tr[3]/td[3]/a")
print(fixture1.text)
links = []
i = 3
while i <= 6:
    fixture = driver.find_element_by_xpath("/html/body/div[3]/div/div[2]/div/div/div[2]/div/table/tbody/tr[" + str(i) + "]/td[3]/a")
    links.append(fixture)
    i = i + 3

print(links)
driver.close()
When I scrape one match it returns the data I'm expecting. However, when I tried to make a loop to get all the soccer games, I ran into a problem.
Here is the result of the code:
Betis Seville - Granada 74 Cf
[<selenium.webdriver.firefox.webelement.FirefoxWebElement (session="0199958a-4d31-4a21-9856-8f8c3cc8ee05", element="158fcdaf-501f-41a4-9550-8a42543acc22")>, <selenium.webdriver.firefox.webelement.FirefoxWebElement (session="0199958a-4d31-4a21-9856-8f8c3cc8ee05", element="74e67896-fccb-48da-8eef-bbf8d9a6f3b3")>]
I wanted to get the first element, but I don't get what I was expecting.
This works well
from selenium import webdriver
driver = webdriver.Firefox()
driver.get("https://www.coteur.com/cotes-foot.php")
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')
data = [l.text for l in links]
print(data)
I tried your code; here is the result:
File "./coteur2.py", line 17
data = [l.text for l in links]
^
IndentationError: unexpected indent
I prefer to do it this way:
links = driver.find_elements_by_xpath('//a[contains(@href, "match/cotes-")]')

n = 0
while n < len(links):
    links[n] = links[n].text
    n = n + 1

print(links)
Thanks for your help
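For reference, the while loop in the comment above is equivalent to the one-line list comprehension from the answer (same links variable); the IndentationError earlier just means that line was pasted with extra leading whitespace:
links = [l.text for l in links]
print(links)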
I'm trying to use Requests to get to the "next" (right arrow) page of this URL:
https://www.sportstats.ca/display-results.xhtml?raceid=43572.
I used the Chrome developer tools to examine what happens when I page forward manually in a browser, and I've attempted to put together the same form data and make a POST with Requests, but the response I get back still shows Page 1 in the content. Any tips? I've also tried Selenium with mixed results; I'd much rather stick to lightweight Requests if at all possible. Here's my attempt:
#!/usr/bin/env python
import requests
from bs4 import BeautifulSoup
url = 'https://www.sportstats.ca/display-results.xhtml?raceid=43572'
with requests.Session() as s:
    r1 = s.get(url)
    pagenum = [x for x in r1.text.splitlines() if '<p>Page' in x][0].strip()
    print(pagenum)

    soup = BeautifulSoup(r1.text, 'html.parser')
    hidden_inputs = soup.findAll('input', {'type': 'hidden'})
    prepayload = {x['name']: x['value'] for x in hidden_inputs}

    payload = {}
    payload['javax.faces.partial.ajax'] = 'true'
    payload['javax.faces.source'] = 'mainForm:j_idt386'
    payload['javax.faces.partial.execute'] = 'mainForm'
    payload['javax.faces.partial.render'] = 'mainForm:result_table mainForm:pageNav mainForm:eventAthleteDetailsDialog'
    payload['mainForm:j_idt386'] = 'mainForm:j_idt386'
    payload['mainForm'] = prepayload['mainForm']
    payload['mainForm:raceid'] = prepayload['mainForm:raceid']
    payload['mainForm:status'] = prepayload['mainForm:status']
    payload['mainForm:iframe'] = prepayload['mainForm:iframe']
    payload['mainForm:bib'] = ''
    payload['mainForm:lastname'] = ''
    payload['mainForm:city'] = ''
    payload['mainForm:firstname'] = ''
    payload['mainForm:province'] = ''
    payload['mainForm:categoryFilter'] = 'All Categories'
    payload['javax.faces.ViewState'] = prepayload['javax.faces.ViewState']

    r2 = s.post(url, data=payload)
    pagenum = [x for x in r2.text.splitlines() if '<p>Page' in x][0].strip()
    print(pagenum)
This comes back with:
[myname#myserver] $ ./sstest.py
<p>Page 1 / 19
<p>Page 1 / 19
The website you want to scrape is better suited to Selenium.
All you need is to read the total number of pages when you first load the site, then loop over that count and click the next button once per iteration.
During every iteration you can do the required parsing for that page as you normally would.
This way the script dynamically parses every page, however many pages the website reports.
Code:
#!/usr/bin/env python
import time
from bs4 import BeautifulSoup
from selenium import webdriver
# Intializations
driver = webdriver.Chrome()
url = 'https://www.sportstats.ca/display-results.xhtml?raceid=43572'
driver.get(url)
driver.maximize_window()
bs = BeautifulSoup(driver.page_source, 'html.parser')
# Retrieve the total number of pages
PagesParser = driver.find_element_by_xpath('//*[@id="mainForm:pageNav"]/div/p')
pages = int(str(PagesParser.text).split('/')[1].replace(' ', ''))
print(pages)

# Loops over every page
for i in range(1, pages+1):
    print('page: ' + str(i))
    # Do your parsing here for every page
    time.sleep(5)
    driver.find_element_by_xpath('//*[@id="mainForm:j_idt386"]').click()  # Clicks the next button
Output:
19
page: 1
page: 2
page: 3
page: 4
page: 5
page: 6
page: 7
page: 8
page: 9
page: 10
page: 11
page: 12
page: 13
page: 14
page: 15
page: 16
page: 17
page: 18
page: 19
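If it helps, the "Do your parsing here" step in the loop above could look roughly like this (a sketch only; it assumes the results sit in a table whose id contains result_table, as the mainForm:result_table id in the question's payload suggests):
from bs4 import BeautifulSoup

def parse_current_page(driver):
    # Turn the currently displayed results table into a list of cell-text rows.
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    table = soup.find('table', id=lambda v: v and 'result_table' in v)  # assumed id fragment
    if table is None:
        return []
    rows = []
    for tr in table.find_all('tr'):
        cells = [td.get_text(strip=True) for td in tr.find_all('td')]
        if cells:
            rows.append(cells)
    return rows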
I have tried many times, but it does not work:
import requests
from lxml import html, etree
from selenium import webdriver
import time, json
#how many page do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'
url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page=1&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'
driver = webdriver.Chrome()
driver.get(url)
base_html = driver.page_source
selctor = etree.HTML(base_html)
date_info = []
name_data, price_data = [], []
jd_goods_data = {}
for q in range(page_num):
    i = int(1)
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)

    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)

    time.sleep(2)
    driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click()
    time.sleep(2)

# for k, v in jd_goods_data.items():
#     print(k,v)
I am trying to download some product details, but it doesn't work: if you enter 2 pages to scan, it downloads only one page's details, but twice!
Ok, you define q but you do not actually use it as such. In this case, the convention is to name this unused variable as _. I mean, instead of doing
for q in range(page_num):
you should do
for _ in range(page_num):
Thus, other programmers will immediately see that you do not use q and only want the operation to be repeated.
Your symptom (the same page scraped twice) means that, for some reason, the line driver.find_element_by_xpath('//*[@id="J_bottomPage"]/span[1]/a[10]').click() does not advance to the next page. There is surely a way to make that click work, but in your case I notice that your url contains a parameter named page. I recommend you use it instead, which also means actually using the variable q, as follows:
import requests
from lxml import html,etree
from selenium import webdriver
import time, json
#how many page do you want to scan
page_numnotint = input("how many page do you want to scan")
page_num = int(page_numnotint)
file_name = 'jd_goods_data.json'
driver = webdriver.Chrome()
date_info = []
name_data, price_data = [], []
jd_goods_data = {}
for q in range(page_num):
    url = 'https://list.jd.com/list.html?cat=1713,3264,3414&page={page}&delivery=1&sort=sort_totalsales15_desc&trans=1&JL=4_10_0#J_main'.format(page=q)
    driver.get(url)
    base_html = driver.page_source
    selctor = etree.HTML(base_html)
    i = 1
    while True:
        name_string = '//*[@id="plist"]/ul/li[%d]/div/div[3]/a/em/text()' % (i)
        price_string = '//*[@id="plist"]/ul/li[%d]/div/div[2]/strong[1]/i/text()' % (i)
        if i == 60:
            break
        else:
            i += 1
        name = selctor.xpath(name_string)[0]
        name_data.append(name)
        price = selctor.xpath(price_string)[0]
        price_data.append(price)
        jd_goods_data[name] = price
        print(name_data)

    with open(file_name, 'w') as f:
        json.dump(jd_goods_data, f)

driver.quit()
I'm scraping a site with Beautiful Soup. The problem I have is that certain parts of the site are paginated with JS, with an unknown (varying) number of pages to scrape.
I'm trying to get around this with a generator, but it's my first time writing one and I'm having a hard time wrapping my head around it and figuring out if what I'm doing makes sense.
Code:
from bs4 import BeautifulSoup
import urllib
import urllib2
import jabba_webkit as jw
import csv
import string
import re
import time
tlds = csv.reader(open("top_level_domains.csv", 'r'), delimiter=';')
sites = csv.writer(open("websites_to_scrape.csv", "w"), delimiter=',')
tld = "uz"
has_next = True
page = 0
def create_link(tld, page):
    if page == 0:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain"
    else:
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-" + tld + "-domain/page/" + repr(page)
    return link

def check_for_next(soup):
    disabled_nav = soup.find(class_="pagingDivDisabled")
    if disabled_nav:
        if "Next" in disabled_nav:
            return False
        else:
            return True
    else:
        return True

def make_soup(link):
    html = jw.get_page(link)
    soup = BeautifulSoup(html, "lxml")
    return soup

def all_the_pages(counter):
    while True:
        link = create_link(tld, counter)
        soup = make_soup(link)
        if check_for_next(soup) == True:
            yield counter
        else:
            break
        counter += 1

def scrape_page(soup):
    table = soup.find('table', {'class': 'rankTable'})
    th = table.find('tbody')
    test = th.find_all("td")
    correct_cells = range(1, len(test), 3)
    for cell in correct_cells:
        #print test[cell]
        url = repr(test[cell])
        content = re.sub("<[^>]*>", "", url)
        sites.writerow([tld] + [content])

def main():
    for page in all_the_pages(0):
        print page
        link = create_link(tld, page)
        print link
        soup = make_soup(link)
        scrape_page(soup)

main()
My thinking behind the code:
The scraper should get the page, determine if there is another page that follows, scrape the current page, and move on to the next one, repeating the process. If there is no next page, it should stop. Does that make sense, the way I'm going about it here?
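For comparison, here is a sketch of a generator that yields the parsed soup directly, so each page is only fetched once (it reuses the create_link, make_soup, check_for_next and scrape_page helpers plus the tld variable from the code above):
def all_the_soups(tld, counter=0):
    # Fetch pages until the "Next" link is disabled, yielding each parsed page exactly once.
    while True:
        soup = make_soup(create_link(tld, counter))
        yield counter, soup
        if not check_for_next(soup):
            break
        counter += 1

def main():
    for page, soup in all_the_soups(tld):
        print(page)
        scrape_page(soup)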
As I told you, you could use Selenium to programmatically click the Next button, but since that is not an option for you, I can think of the following method to get the number of pages using pure BS4:
import requests
from bs4 import BeautifulSoup
def page_count():
    pages = 1
    url = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}"
    while True:
        html = requests.get(url.format(pages)).content
        soup = BeautifulSoup(html)
        table = soup.find('table', {'class': 'rankTable'})
        if len(table.find_all('tr')) <= 1:
            return pages
        pages += 1
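Usage could then look roughly like this, feeding the page count into the helpers from the question (a sketch; it assumes, as page_count does, that /page/1 is the first page):
def main():
    total = page_count()
    for page in range(1, total + 1):
        # build the page URL the same way page_count probes it, then reuse make_soup/scrape_page
        link = "https://domaintyper.com/top-websites/most-popular-websites-with-uz-domain/page/{}".format(page)
        scrape_page(make_soup(link))

main()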