I'm crawling through some directories on an ASP.NET site via Scrapy.
The pagination links are encoded like this:
javascript:__doPostBack('MoreInfoListZbgs1$Pager','X')
where X is an integer between 1 and 180. The problem is that the URL stays the same when I click the next page or any other page.
I've written some code below, but it can only extract the links on the first page.
# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup
import re
from scrapy.http import FormRequest
import js2xml
import requests
from datetime import datetime
class nnggzySpider(scrapy.Spider):
name = 'nnggzygov'
start_urls = [
'https://www.nnggzy.org.cn/gxnnzbw/showinfo/zbxxmore.aspx?categorynum=001004001'
]
base_url = 'https://www.nnggzy.org.cn'
custom_settings = {
'LOG_LEVEL': 'ERROR'
}
def parse(self, response):
_response = response.text
self.data = {}
soup = BeautifulSoup(response.body, 'html.parser')
tags = soup.find_all('a', href=re.compile(r"InfoDetail"))
# Get the pagination parameters
__VIEWSTATE = re.findall(r'id="__VIEWSTATE" value="(.*?)" />', _response)
A = __VIEWSTATE[0]
# print(A)
__EVENTTARGET = 'MoreInfoListZbgs1$Pager'
B = __EVENTTARGET
__CSRFTOKEN = re.findall(r'id="__CSRFTOKEN" value="(.*?)" />', _response)
C = __CSRFTOKEN
page_num = re.findall(r'title="转到第(.*?)页"', _response)
max_page = page_num[-1]
content = {
'__VIEWSTATE': A,
'__EVENTTARGET': B,
'__CSRFTOKEN': C,
'page_num': max_page
}
infoid = re.findall(r'InfoID=(.*?)&CategoryNum', _response)
print(infoid)
yield scrapy.Request(url=response.url, callback=self.parse_detail, meta={"data": content})
def parse_detail(self, response):
max_page = response.meta['data']['page_num']
for i in range(2, int(max_page)):
data = {
'__CSRFTOKEN': '{}'.format(response.meta['data']['__CSRFTOKEN']),
'__VIEWSTATE': '{}'.format(response.meta['data']['__VIEWSTATE']),
'__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
'__EVENTARGUMENT': '{}'.format(i),
# '__VIEWSTATEENCRYPTED': '',
# 'txtKey': ''
}
yield scrapy.FormRequest(url=response.url, callback=self.parse, formdata=data, method="POST", dont_filter=True)
Can anyone help me with this?
It looks like the pagination on the mentioned website is done by sending POST requests with formdata like:
{
"__CSRFTOKEN": ...,
"__VIEWSTATE": ...,
"__EVENTTARGET": "MoreInfoListZbgs1$Pager",
"__EVENTARGUMENT": page_number,
"__VIEWSTATEENCRYPTED": "",
"txtKey": ""
}
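For example, a minimal sketch of a parse callback built around that formdata (the CSS selectors and the parse_page callback name are assumptions, and the site may insist on the __VIEWSTATE value returned with each response rather than the first page's):
def parse(self, response):
    # Hidden ASP.NET fields needed for the __doPostBack postback.
    viewstate = response.css('input#__VIEWSTATE::attr(value)').get()
    csrftoken = response.css('input#__CSRFTOKEN::attr(value)').get()

    # Page 1 is the response we already have; request pages 2..180.
    for page in range(2, 181):
        yield scrapy.FormRequest(
            url=response.url,
            formdata={
                '__CSRFTOKEN': csrftoken or '',
                '__VIEWSTATE': viewstate or '',
                '__EVENTTARGET': 'MoreInfoListZbgs1$Pager',
                '__EVENTARGUMENT': str(page),
                '__VIEWSTATEENCRYPTED': '',
                'txtKey': '',
            },
            callback=self.parse_page,  # hypothetical callback that extracts the InfoDetail links
            dont_filter=True,
        )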
I know this is a year-old thread, but I am posting the answer for future visitors from Google search.
Your form submission didn't work because there must be some more hidden fields at the bottom of the web page, inside the form. In my case there were, and here's the working submission:
# This is the next page link
# <a id="nextId" href="javascript:__doPostBack('MoreInfoListZbgs1$Pager','')"> Next </a>
# This is how the website evaluate the next link
# <script type="text/javascript">
# //<![CDATA[
# var theForm = document.forms['Form1'];
# if (!theForm) {
# theForm = document.Form1;
# }
# function __doPostBack(eventTarget, eventArgument) {
# if (!theForm.onsubmit || (theForm.onsubmit() != false)) {
# theForm.__EVENTTARGET.value = eventTarget;
# theForm.__EVENTARGUMENT.value = eventArgument;
# theForm.submit();
# }
# }
# //]]>
# </script>
# According to above js code, we need to pass in the following arguments:
data = {
'__EVENTTARGET': 'MoreInfoListZbgs1$Pager', # first argument from javascript:__doPostBack('MoreInfoListZbgs1$Pager','') next link
'__EVENTARGUMENT': '', # second argument from javascript:__doPostBack('MoreInfoListZbgs1$Pager','') next link, in my case it is empty
'__VIEWSTATE': response.css('input[name=__VIEWSTATE]::attr("value")').get(),
# These are the more hidden input fields you need to pass in
'__VIEWSTATEGENERATOR': response.css('input[name=__VIEWSTATEGENERATOR]::attr("value")').get(),
'__EVENTVALIDATION': response.css('input[name=__EVENTVALIDATION]::attr("value")').get(),
}
yield scrapy.FormRequest(url=form_action_url_here, formdata=data, callback=self.parse)
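If you don't know the form's action URL up front, one way to get it (a sketch, assuming the page has a single <form> element, as ASP.NET Web Forms pages usually do) is to read it from the response and join it against the page URL:
# Assumes a single form on the page; adjust the selector if there are several.
form_action = response.css('form::attr(action)').get()
form_action_url = response.urljoin(form_action) if form_action else response.url
yield scrapy.FormRequest(url=form_action_url, formdata=data, callback=self.parse)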
Related
I am trying to scrape a website using Python and Scrapy, but I have issues with saving the result.
The error log I receive:
yield result = {
^
SyntaxError: invalid syntax
When I remove the "result = ", I don't get any error, but the reason I am doing that is to save the result as a variable, which I use at the last part of the code in "f.write(result)".
The code goes below:
import scrapy
class ExampleSpider(scrapy.Spider):
name = "ufcspider"
start_urls = [
'http://quotes.toscrape.com/page/1/',
]
def parse(self, response):
for quote in response.css('div.quote'):
yield result = {
'text': quote.css('span.text::text').get(),
'author': quote.css('small.author::text').get(),
'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
'tags': quote.css('div.tags a.tag::text').getall(),
}
next_page = response.css("li.next a::attr(href)").get()
if next_page is not None:
next_page = response.urljoin(next_page)
yield scrapy.Request(next_page, callable=self.parse)
page = response.url.split("/")[-2]
filename = f'quotes-{page}.json'
with open(filename, 'wb') as f:
f.write(result)
self.log(f'Saved file {filename}')
First define result, then yield it:
result = { ... }
yield result
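If you also need the items in a file, here is a minimal sketch along those lines (assuming one JSON-lines file per page is acceptable; Scrapy's built-in feed export, e.g. scrapy crawl ufcspider -o quotes.jl, is usually the simpler route):
import json

def parse(self, response):
    results = []
    for quote in response.css('div.quote'):
        result = {
            'text': quote.css('span.text::text').get(),
            'author': quote.css('small.author::text').get(),
            'link': 'http://quotes.toscrape.com' + quote.css("span a::attr(href)").get(),
            'tags': quote.css('div.tags a.tag::text').getall(),
        }
        results.append(result)
        yield result

    # Write the collected items for this page as JSON lines.
    page = response.url.split("/")[-2]
    filename = f'quotes-{page}.jl'
    with open(filename, 'w') as f:
        for item in results:
            f.write(json.dumps(item) + '\n')
    self.log(f'Saved file {filename}')

    next_page = response.css("li.next a::attr(href)").get()
    if next_page is not None:
        yield scrapy.Request(response.urljoin(next_page), callback=self.parse)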
I am trying to post input data into a form using a requests.session and it's returning a 500 status.
I am expecting to see the search results retrieved.
I was able to get around a previous login issue with __RequestVerificationToken and cookies, thanks to the help of Bertrand Martel. The next step in my process is to get the Search page, which I was able to get successfully. It now fails when I try to post data into the date fields on the form, which make up the search criteria. It works when I manually complete the form and press submit. All seems very straightforward to me, but I am not sure why it won't work. Is it still a cookies issue? Any help would be appreciated.
Here is my code:
import requests
from bs4 import BeautifulSoup
EMAIL = 'myemail@gmail.com'
PASSWORD = 'somepwd'
LOGIN_URL = 'https://www.idocmarket.com/Security/LogOn'
SEARCH_URL = 'https://www.idocmarket.com/RIOCO/Document/Search'
s = requests.Session()
s.get(LOGIN_URL)
result = s.post(LOGIN_URL, data = {
"Login.Username": EMAIL,
"Login.Password": PASSWORD
})
soup = BeautifulSoup(result.text, "html.parser")
# Report successful login
print("Login succeeded: ", result.ok)
print("Status code:", result.status_code)
result = s.get(SEARCH_URL)
auth_token = soup.find("input", {'name': '__RequestVerificationToken'}).get('value')
print('auth token:', auth_token )
print("Get Search succeaeded: ", result.ok)
print("get Search Statusa code:", result.status_code)
result = s.post(SEARCH_URL, data = {
"__RequestVerificationToken": auth_token,
"StartRecordDate": "03/01/2019",
"EndRecordDate": "03/31/2019",
"StartDocNumber": "",
"EndDocNumber": "",
"Book": "",
"Page": "",
"Instrument": "",
"InstrumentGroup": "",
"PartyType": "Either",
"PartyMatchType": "Contains",
"PartyName": "",
"Subdivision": "",
"StartLot": "",
"EndLot": "",
"Block": "",
"Section":"",
"Township": "",
"Range": "",
"Legal": "",
"CountyKey": "RIOCO"
})
print("post Dates succeeded: ", result.ok)
print("post Dates Status code:", result.status_code)
print(result.text)
It seems that this time the XSRF token is needed in the POST along with all the existing parameters. A simple solution is to get all the input values and pass them to the request:
import requests
from bs4 import BeautifulSoup
LOGIN_URL = 'https://www.idocmarket.com/Security/LogOn'
SEARCH_URL = 'https://www.idocmarket.com/RIOCO/Document/Search'
EMAIL = 'myemail@gmail.com'
PASSWORD = 'somepwd'
s = requests.Session()
s.get(LOGIN_URL)
r = s.post(LOGIN_URL, data = {
"Login.Username": EMAIL,
"Login.Password": PASSWORD
})
if (r.status_code == 200):
r = s.get(SEARCH_URL)
soup = BeautifulSoup(r.text, "html.parser")
payload = {}
for input_item in soup.select("input"):
if input_item.has_attr('name'):
payload[input_item["name"]] = input_item["value"]
payload["StartRecordDate"] = '09/01/2019'
payload["EndRecordDate"] = '09/30/2019'
r = s.post(SEARCH_URL, data = payload)
soup = BeautifulSoup(r.text, "html.parser")
print(soup)
else:
print("authentication failure")
Alternatively, using a list comprehension for the payload, you can write:
temp_pl = [
(t['name'], t['value'])
for t in soup.select("input")
if t.has_attr('name')
]
payload = dict(temp_pl)
payload["StartRecordDate"] = '09/01/2019'
payload["EndRecordDate"] = '09/30/2019'
Here's the part of the code that I want to run 4 times. Without counters it works as intended: the link to the next page is retrieved and scraped for relevant data:
def parse_commits_page(self, response):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
yield response.follow(next_page, callback=self.parse_commits_page)
Here are the variants of the cycle I tried:
Adding a simple global counter:
count = 0
def parse_commits_page(self, response):
global count
while (count < 4):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
count = count + 1
next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
yield response.follow(next_page, callback=self.parse_commits_page)
Adding a sub-function:
def parse_commits_page(self, response):
def grabber( response ):
return {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
yield grabber( response )
for i in range(3):
yield response.follow(
response.xpath('//a[@rel="nofollow"]/@href')[-1].extract(),
callback=grabber
)
In the case of the counter, the response value is updated either once (if placed as in this code) or not at all if count = count + 1 is placed at the end.
In the case of the sub-function, the response is updated only on the last iteration, resulting in 2 scraped pages instead of 4.
What is the correct way to implement the cycle so that variables are updated as intended?
Here's the complete code if that helps (I use 4 defs instead of a cycle right now):
# -*- coding: utf-8 -*-
import scrapy
from random import randint
from time import sleep
BASE_URL = 'https://github.com'
class DiscoverSpider(scrapy.Spider):
name = 'discover_commits_new'
allowed_domains = ['github.com']
start_urls = ['https://github.com/search?utf8=%E2%9C%93&q=stars%3E100&ref=simplesearch']
def parse(self, response):
# Select all the project urls on page
project = BASE_URL + response.xpath('//h3/a[@class="v-align-middle"]/@href').extract_first()
yield response.follow(project, self.parse_project)
# Random wait, so GitHub doesn't ban me right away
sleep(randint(5,20))
# Follow to the next page when every project on this one is scraped
next_page = response.xpath('//a[@rel="next"]/@href').extract_first()
if next_page is not None:
next_page = BASE_URL + next_page
yield response.follow(next_page, callback=self.parse)
# Parse the main page of the project
def parse_project(self, response):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'tags': [x.strip() for x in response.css('.topic-tag::text').extract()],
'lang_name': response.css('.lang::text').extract(),
'lang_perc' : response.css('.percent::text').extract(),
'stars': response.css('.social-count::text').extract()[1].strip(),
'forks': response.css('.social-count::text').extract()[2].strip(),
'commits': response.css('.text-emphasized::text').extract()[0].strip(),
'contributors': response.css('.text-emphasized::text').extract()[3].strip()
}
commits_page = BASE_URL + response.xpath('//*[@class="commits"]//@href').extract_first()
yield response.follow(commits_page, self.parse_commits_page)
# Get last commits
def parse_commits_page(self, response):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
yield response.follow(next_page, callback=self.parse_commits_page1)
def parse_commits_page1(self, response):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
yield response.follow(next_page, callback=self.parse_commits_page2)
def parse_commits_page2(self, response):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
yield response.follow(next_page, callback=self.parse_commits_page3)
def parse_commits_page3(self, response):
yield {
'author': response.xpath('//a[@rel="author"]/text()').extract(),
'name': response.xpath('//strong/a/text()').extract(),
'last_commits': response.xpath('//relative-time/text()').extract()
}
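For reference, a sketch of one way to collapse those four callbacks into a single one by carrying a page counter in the request meta (the limit of 4 pages and the 'page_count' key are assumptions):
def parse_commits_page(self, response):
    yield {
        'author': response.xpath('//a[@rel="author"]/text()').extract(),
        'name': response.xpath('//strong/a/text()').extract(),
        'last_commits': response.xpath('//relative-time/text()').extract()
    }
    # How many commit pages have been scraped for this project so far.
    page_count = response.meta.get('page_count', 1)
    if page_count < 4:
        next_page = response.xpath('//a[@rel="nofollow"]/@href')[-1].extract()
        yield response.follow(
            next_page,
            callback=self.parse_commits_page,
            meta={'page_count': page_count + 1},
        )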
I want to use Scrapy and Python 2.7.11 to simulate a FormRequest to crawl http://www.istic.ac.cn/suoguan/QiKan_ShouYe.htm?lan=en&journalId=IELEP0229&yp=2018
Here is my code:
def start_requests(self):
posturl = 'http://www.istic.ac.cn/suoguan/essearch.ashx'
url = 'http://www.istic.ac.cn/suoguan/QiKan_ShouYe.htm?lan=en&journalId=IELEP0229&yp=2018'
journalId = re.search(r'journalId=(.*?)&', url).group(1)
yearNum = re.search(r'&yp=(\d+)', url).group(1)
postdata = {
"indexname" : "xw_qk",
"search" : "{0}/F(F_ReqNum)*{1}/F(F_YEAR)".format(journalId, yearNum),
"page" : "0",
"pagenum" : "20",
"sort" : "",
"type" : "content",
}
print journalId, yearNum
print postdata
self.logger.info('Visit_headpage........................')
yield FormRequest(posturl, formdata = postdata, callback = self.parse_item)
I need to post the following data to the form:
indexname=xw_qk&
search=IELEP0229%2F(F_ReqNum)*2018%2F(F_YEAR)
&page=0&pagenum=20&sort=&type=content
to crawl the page correctly.
But my response is empty, so I used Fiddler to inspect the posted form data, and it is:
indexname=xw_qk&
search=IELEP0229%2FF%28F_ReqNum%29%2A2018%2FF%28F_YEAR%29
&page=0&pagenum=20sort=&&type=content
So it seems these three characters get encoded incorrectly: '(', ')', '*'.
But when I print formdata in the Scrapy log, it is still in the right format:
{'indexname': 'xw_qk', 'search': 'IELEP0229/(F_ReqNum)*2018/(F_YEAR)',
'page': '0', 'pagenum': '20', 'sort': '', 'type': 'content'}
So how can I solve it?
I suggest using Request(method='POST') instead of FormRequest(), because I had many troubles with that function.
Also try to append the params directly to posturl, like this:
yield Request(url=posturl + "?search=" + "{0}/F(F_ReqNum)*{1}/F(F_YEAR)".format(journalId, yearNum), method='POST')
and concatenate the other params too.
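If you would rather keep the parameters in the request body than in the URL, a sketch of the same idea (assuming postdata and posturl from the question, on Python 2.7) is to URL-encode the dict yourself and pass it as body:
from urllib import urlencode  # Python 2.7; in Python 3 it lives in urllib.parse
from scrapy import Request

# Inside start_requests(), in place of the FormRequest:
body = urlencode(postdata)
yield Request(
    posturl,
    method='POST',
    body=body,
    headers={'Content-Type': 'application/x-www-form-urlencoded'},
    callback=self.parse_item,
)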
They are sending the same thing (Scrapy's FormRequest just URL-encodes the data), but what I think is happening is that the request needs a cookie that is set when you first land at http://www.istic.ac.cn/suoguan/QiKan_ShouYe.htm?lan=en&journalId=IELEP0229&yp=2018. Please try the following:
# -*- coding: utf-8 -*-
import json
import re
import scrapy
from scrapy import FormRequest
class IsticSpider(scrapy.Spider):
name = "istic"
allowed_domains = ["istic.ac.cn"]
start_urls = ['http://www.istic.ac.cn/suoguan/QiKan_ShouYe.htm?lan=en&journalId=IELEP0229&yp=2018']
def parse(self, response):
posturl = 'http://www.istic.ac.cn/suoguan/essearch.ashx'
journalId = re.search(r'journalId=(.*?)&', response.url).group(1)
yearNum = re.search(r'&yp=(\d+)', response.url).group(1)
postdata = {
"indexname" : "xw_qk",
"search" : "{0}/F(F_ReqNum)*{1}/F(F_YEAR)".format(journalId, yearNum),
"page" : "0",
"pagenum" : "20",
"sort" : "",
"type" : "content",
}
yield FormRequest(posturl, formdata = postdata, callback = self.parse_item)
def parse_item(self, response):
data = json.loads(response.body_as_unicode())
self.logger.debug('%s', data.keys())
It should output [u'facets', u'hits', u'took']
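If the JSON follows the usual Elasticsearch response layout (an assumption; inspect the payload first), the individual records could then be pulled out roughly like this inside parse_item:
# Assumption: Elasticsearch-style payload with records under hits -> hits -> _source.
for hit in data.get('hits', {}).get('hits', []):
    yield hit.get('_source', hit)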
Hope you are all well! I'm new and using Python 2.7! I'm trying to extract emails from a publicly available directory website that does not seem to have an API. This is the site: http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search
The code stops gathering emails at the point near the bottom of the page where it says "load more".
Here is my code:
import requests
import re
from bs4 import BeautifulSoup
file_handler = open('mail.txt','w')
soup = BeautifulSoup(requests.get('http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search').content)
tags = soup('a')
list_new =[]
for tag in tags:
if (re.findall(r'href="mailto:([^"@]+@[^"]+)">\1</a>',('%s'%tag))): list_new = list_new +(re.findall(r'href="mailto:([^"@]+@[^"]+)">\1</a>', ('%s'%tag)))
for x in list_new:
file_handler.write('%s\n'%x)
file_handler.close()
How can I make sure that the code goes all the way to the end of the directory and does not stop where it shows "load more"?
Thanks.
Warmest regards
You just need to post some data, in particular incrementing group_no to simulate clicking the load more button:
from bs4 import BeautifulSoup
import requests
# you can set whatever here to influence the results
data = {"group_no": "1",
"search": "category",
"segment": "",
"activity": "",
"retail": "",
"category": "",
"Bpark": "",
"alpha": ""}
post = "http://www.tecomdirectory.com/getautocomplete_keyword.php"
with requests.Session() as s:
soup = BeautifulSoup(
s.get("http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search").content,
"html.parser")
print([a["href"] for a in soup.select("a[href^=mailto:]")])
for i in range(1, 5):
data["group_no"] = str(i)
soup = BeautifulSoup(s.post(post, data=data).content, "html.parser")
print([a["href"] for a in soup.select("a[href^=mailto:]")])
To go until the end, you can loop until the post returns no html, that signifies we cannot load any more pages:
def yield_all_mails():
data = {"group_no": "1",
"search": "category",
"segment": "",
"activity": "",
"retail": "",
"category": "",
"Bpark": "",
"alpha": ""}
post = "http://www.tecomdirectory.com/getautocomplete_keyword.php"
start = "http://www.tecomdirectory.com/companies.php?segment=&activity=&search=category&submit=Search"
with requests.Session() as s:
resp = s.get(start)
soup = BeautifulSoup(s.get(start).content, "html.parser")
yield (a["href"] for a in soup.select("a[href^=mailto:]"))
i = 1
while resp.content.strip():
data["group_no"] = str(i)
resp = s.post(post, data=data)
soup = BeautifulSoup(resp.content, "html.parser")
yield (a["href"] for a in soup.select("a[href^=mailto:]"))
i += 1
So if we run the function like below, setting "alpha": "Z" to iterate over just the Z's:
from itertools import chain
for mail in chain.from_iterable(yield_all_mails()):
print(mail)
We would get:
mailto:info@10pearls.com
mailto:fady@24group.ae
mailto:pepe@2heads.tv
mailto:2interact@2interact.us
mailto:gc@worldig.com
mailto:marilyn.pais@3i-infotech.com
mailto:3mgulf@mmm.com
mailto:venkat@4gid.com
mailto:info@4power.biz
mailto:info@4sstudyabroad.com
mailto:fouad@622agency.com
mailto:sahar@7quality.com
mailto:mike.atack@8ack.com
mailto:zyara@emirates.net.ae
mailto:aokasha@zynx.com
Process finished with exit code 0
You should put a sleep in between requests so you don't hammer the server and get yourself blocked.
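For example, a minimal pause inside the while loop of yield_all_mails(), right after each POST (the 2-5 second range is an arbitrary choice):
import time
import random

# ... inside the while loop, after s.post(post, data=data):
time.sleep(random.uniform(2, 5))  # be polite to the server between page requests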