I know there is built-in middleware in Scrapy to handle file downloads, but it only accepts a URL. In my case the download link is a POST request: when I make that POST request, a PDF file starts downloading. Now I want to download that file from the POST request in Scrapy.
The website is http://scrb.bihar.gov.in/View_FIR.aspx. You can enter the district Aurangabad and the police station Kasma PS. In the last column, Status, there is a link that downloads the file.
ps_x = '//*[@id="ctl00_ContentPlaceHolder1_ddlPoliceStation"]//option[.="Kasma PS"]/@value'
police_station_val = response.xpath(ps_x).extract_first()
d_x = '//*[@id="ctl00_ContentPlaceHolder1_ddlDistrict"]//option[.="Aurangabad"]/@value'
district_val = response.xpath(d_x).extract_first()
viewstate = response.xpath(self.viewstate_x).extract_first()
viewstategen = response.xpath(self.viewstategen_x).extract_first()
eventvalidator = response.xpath(self.eventvalidator_x).extract_first()
eventtarget = response.xpath(self.eventtarget_x).extract_first()
eventargs = response.xpath(self.eventargs_x).extract_first()
lastfocus = response.xpath(self.lastfocus_x).extract_first()
payload = {
    '__EVENTTARGET': eventtarget,
    '__EVENTARGUMENT': eventargs,
    '__LASTFOCUS': lastfocus,
    '__VIEWSTATE': viewstate,
    '__VIEWSTATEGENERATOR': viewstategen,
    '__EVENTVALIDATION': eventvalidator,
    'ctl00$ContentPlaceHolder1$ddlDistrict': district_val,
    'ctl00$ContentPlaceHolder1$ddlPoliceStation': police_station_val,
    'ctl00$ContentPlaceHolder1$optionsRadios': 'radioPetioner',
    'ctl00$ContentPlaceHolder1$txtSearchBy': '',
    'ctl00$ContentPlaceHolder1$rptItem$ctl06$lnkStatus.x': '21',
    'ctl00$ContentPlaceHolder1$rptItem$ctl06$lnkStatus.y': '24',
}
headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Origin': 'http://scrb.bihar.gov.in',
    'Upgrade-Insecure-Requests': '1',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Referer': 'http://scrb.bihar.gov.in/View_FIR.aspx',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'en-US,en;q=0.9',
}
# req = requests.post(response.url, data=payload, headers=headers)
# with open('pdf/ch.pdf', 'w+b') as f:
#     f.write(req.content)
When you click download, the web browser sends a POST request, so the answer mentioned earlier by El Ruso is applicable in your case:
.....
def parse(self, response):
    ......
    yield scrapy.FormRequest(
        "http://scrb.bihar.gov.in/View_FIR.aspx",
        # your POST request configuration goes here
        callback=self.save_pdf
    )

def save_pdf(self, response):
    path = response.url.split('/')[-1]
    self.logger.info('Saving PDF %s', path)
    with open(path, 'wb') as f:
        f.write(response.body)
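Putting the question's form data together with that answer, a minimal sketch of the two spider methods might look like this (it assumes payload and headers are built inside parse exactly as in the snippet above, and it reuses the filename from the question's commented-out requests code):

def parse(self, response):
    # ... build `payload` and `headers` as in the snippet above ...
    yield scrapy.FormRequest(
        url='http://scrb.bihar.gov.in/View_FIR.aspx',
        formdata=payload,       # the ASP.NET form fields, including the __VIEWSTATE values
        headers=headers,
        callback=self.save_pdf,
    )

def save_pdf(self, response):
    # the PDF is returned as the body of the POST response
    with open('ch.pdf', 'wb') as f:
        f.write(response.body)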
Hope you're doing well!
I'm trying to scrape data from Booking (hotel name, room, etc.). The code runs without errors, but I don't get the data in the Excel file; the data file is empty.
This is my code:
# Imports needed for the script (Extractor comes from the selectorlib package)
import csv
import requests
from selectorlib import Extractor

# Create an Extractor by reading from the YAML file
e = Extractor.from_yaml_file('C:/Users/pc/OneDrive/Bureau/booking-hotel-scraper-master/booking.yml')

def scrape(url):
    headers = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'Upgrade-Insecure-Requests': '1',
        # You may want to change the user agent if you get blocked
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Referer': 'https://www.booking.com/index.en-gb.html',
        'Accept-Language': 'en-GB,en-US;q=0.9,en;q=0.8',
    }
    # Download the page using requests
    print("Downloading %s" % url)
    r = requests.get(url, headers=headers)
    # Pass the HTML of the page to the extractor and return the extracted data
    return e.extract(r.text, base_url=url)

with open("C:/Users/pc/OneDrive/Bureau/booking-hotel-scraper-master/urls.txt", 'r') as urllist, \
        open('C:/Users/pc/OneDrive/Bureau/booking-hotel-scraper-master/data.csv', 'w') as outfile:
    fieldnames = [
        "name",
        "location",
        "price",
        "price_for",
        "room_type",
        "beds",
        "rating",
        "rating_title",
        "number_of_ratings",
        "url"
    ]
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
    writer.writeheader()
    for url in urllist.readlines():
        data = scrape(url)
        if data and data['hotels'] is not None:
            for h in data["hotels"]:
                writer.writerow(h)
And this is the result in the Excel file: it stays empty, with no data rows.
There is no error in my code; the problem is only that the data never makes it into the file.
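Since the CSV only ever ends up with the header row, a quick check is to print what scrape() returns for a single URL before writing anything; a minimal debugging sketch (the URL here is a placeholder, and it assumes the scrape() function defined above is in scope):

# Debugging sketch: if 'hotels' is None or empty here, the selectors in
# booking.yml are not matching the HTML that requests downloaded
sample = scrape('https://www.booking.com/searchresults.en-gb.html?ss=Paris')  # placeholder URL
print(sample)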
The booking.yml:
I found the following TikTok downloader, which works fine.
from argparse import ArgumentParser
import os
from urllib.parse import parse_qsl, urlparse

import requests


class TikTokDownloader:
    HEADERS = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Dest': 'video',
        'Referer': 'https://www.tiktok.com/',
        'Accept-Language': 'en-US,en;q=0.9,bs;q=0.8,sr;q=0.7,hr;q=0.6',
        'sec-gpc': '1',
        'Range': 'bytes=0-',
    }

    def __init__(self, url: str, web_id: str):
        self.__url = url
        self.__cookies = {
            'tt_webid': web_id,
            'tt_webid_v2': web_id
        }

    def __get_video_url(self) -> str:
        response = requests.get(self.__url, cookies=self.__cookies, headers=TikTokDownloader.HEADERS)
        return response.text.split('"playAddr":"')[1].split('"')[0].replace(r'\u0026', '&')

    def download(self, file_path: str):
        video_url = self.__get_video_url()
        url = urlparse(video_url)
        params = tuple(parse_qsl(url.query))
        request = requests.Request(method='GET',
                                   url='{}://{}{}'.format(url.scheme, url.netloc, url.path),
                                   cookies=self.__cookies,
                                   headers=TikTokDownloader.HEADERS,
                                   params=params)
        prepared_request = request.prepare()
        session = requests.Session()
        response = session.send(request=prepared_request)
        response.raise_for_status()

        if os.path.exists(file_path):
            choice = input('File already exists. Overwrite? (Y/N): ')
            if choice.lower() != 'y':
                return

        with open(os.path.abspath(file_path), 'wb') as output_file:
            output_file.write(response.content)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('--web-id', help='Value of tt_webid or tt_webid_v2 cookie (they are the same).')
    parser.add_argument('-o', '--output', default='download.mp4', help='Full output path.')
    parser.add_argument('url', help='Video url (https://www.tiktok.com/@username/video/1234567890123456789 or https://vm.tiktok.com/a1b2c3/).')
    args = parser.parse_args()

    downloader = TikTokDownloader(args.url, args.web_id)
    downloader.download(args.output)
The issue is that I have to run this command to download each video:
python3 ./tiktok.py --web-id 1234567890123 -o ./file.mp4 https://vm.tiktok.com/...
And I have 1000 links to download. All the links are in a txt file, one per line, without commas, like:
Https://tiktok.com/1
Https://tiktok.com/2
Https://tiktok.com/3
So I'm looking for a way to read the text file and automatically substitute each link into the command I have to run. Or should I change the script itself?
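For reference, one way to keep the script unchanged is a small wrapper that reads the text file and shells out to the existing command once per link; a minimal sketch (it assumes the downloader is saved as tiktok.py, the links live in links.txt one per line, and the web id shown above is valid):

import subprocess

# Hypothetical wrapper around the existing CLI: one run per link in links.txt
with open('links.txt') as f:
    for i, line in enumerate(f, start=1):
        url = line.strip()
        if not url:
            continue
        subprocess.run(
            ['python3', './tiktok.py',
             '--web-id', '1234567890123',
             '-o', './video_{}.mp4'.format(i),
             url],
            check=True,
        )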
Please use my code. I have defined a function that downloads all of those videos; you just pass it the path of the file that holds the thousand links. Preferably save this Python script in the same directory as that file.
Use the function:
A_thousand_links_jbsidis("my_file_with_1000_links.txt")
It gives each video an automatic name based on the date and time. I tested it and it works!
Here is the code by jbsidis:
from argparse import ArgumentParser
import os
from urllib.parse import parse_qsl, urlparse

import requests


class TikTokDownloaderjbsidis:
    HEADERS = {
        'Connection': 'keep-alive',
        'Pragma': 'no-cache',
        'Cache-Control': 'no-cache',
        'DNT': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
        'Accept': '*/*',
        'Sec-Fetch-Site': 'same-site',
        'Sec-Fetch-Mode': 'no-cors',
        'Sec-Fetch-Dest': 'video',
        'Referer': 'https://www.tiktok.com/',
        'Accept-Language': 'en-US,en;q=0.9,bs;q=0.8,sr;q=0.7,hr;q=0.6',
        'sec-gpc': '1',
        'Range': 'bytes=0-',
    }

    def __init__(self, url, web_id):
        self.__url = url
        self.__cookies = {
            'tt_webid': web_id,
            'tt_webid_v2': web_id
        }

    def __get_video_url(self) -> str:
        response = requests.get(self.__url, cookies=self.__cookies, headers=TikTokDownloaderjbsidis.HEADERS)
        return response.text.split('"playAddr":"')[1].split('"')[0].replace(r'\u0026', '&')

    def download(self, file_path: str):
        video_url = self.__get_video_url()
        url = urlparse(video_url)
        params = tuple(parse_qsl(url.query))
        request = requests.Request(method='GET',
                                   url='{}://{}{}'.format(url.scheme, url.netloc, url.path),
                                   cookies=self.__cookies,
                                   headers=TikTokDownloaderjbsidis.HEADERS,
                                   params=params)
        prepared_request = request.prepare()
        session = requests.Session()
        response = session.send(request=prepared_request)
        response.raise_for_status()

        if os.path.exists(file_path):
            choice = str('jbsidis File already exists. Overwrite? (Y/N): ')
        print("Downloading jbsidis == " + str(file_path))
        with open(os.path.abspath(file_path), 'wb') as output_file:
            output_file.write(response.content)


import time
import random


def A_thousand_links_jbsidis(file_with_a_thousand_links):
    n = open(file_with_a_thousand_links).read()
    m = n.splitlines()  # guessing the links are one per line
    MyWebIDis = "1234567890123"  # put the id that works for you
    c = 0
    for new_url in m:
        c = c + 1
        new_auto_file_name = str(c) + " - " + str(time.strftime("_%Y%m%d_%H%M%S_")) + "_video_" + ".mp4"  # i guess they are mp4
        clean_url = str(new_url).replace("\n", "").replace("\x0a", "").replace("\x0d", "").replace(" ", "")
        downloader = TikTokDownloaderjbsidis(clean_url, MyWebIDis)
        downloader.download(new_auto_file_name)
        time.sleep(10)  # just in case the internet is not that fast, wait 10 seconds before the next download


A_thousand_links_jbsidis("my_file_with_1000_links.txt")
And here is the image. I don't know why we sometimes answer questions without giving a real solution. Greetings from El Salvador.
jbsidis
I am accessing the URL https://streeteasy.com/sales/all, which does not show the page unless a Cookie header is set. I have no idea how this cookie value is generated, and I highly doubt the value is fixed, so I guess I can't use a hard-coded Cookie value either.
Code below:
import requests
from bs4 import BeautifulSoup

headers = {
    'authority': 'streeteasy.com',
    'pragma': 'no-cache',
    'cache-control': 'no-cache',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'referer': 'https://streeteasy.com/sales/all',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9,ur;q=0.8',
    'cookie': 'D_SID=103.228.157.1:Bl5GGXCWIxq4AopS1Hkr7nkveq1nlhWXlD3PMrssGpU; _se_t=0944dfa5-bfb4-4085-812e-fa54d44acc54; google_one_tap=0; D_IID=AFB68ACC-B276-36C0-8718-13AB09A55E51; D_UID=23BA0A61-D0DF-383D-88A9-8CF65634135F; D_ZID=C0263FA4-96BF-3071-8318-56839798C38D; D_ZUID=C2322D79-7BDB-3E32-8620-059B1D352789; D_HID=CE522333-8B7B-3D76-B45A-731EB750DF4D; last_search_tab=sales; se%3Asearch%3Asales%3Astate=%7C%7C%7C%7C; streeteasy_site=nyc; se_rs=123%2C1029856%2C123%2C1172313%2C2815; se%3Asearch%3Ashared%3Astate=102%7C%7C%7C%7Cfalse; anon_searcher_stage=initial; se_login_trigger=4; se%3Abig_banner%3Asearch=%7B%22123%22%3A2%7D; se%3Abig_banner%3Ashown=true; se_lsa=2019-07-08+04%3A01%3A30+-0400; _ses=BAh7DEkiD3Nlc3Npb25faWQGOgZFVEkiJWRiODVjZTA1NmYzMzZkMzZiYmU4YTk4Yjk5YmU5ZTBlBjsAVEkiEG5ld192aXNpdG9yBjsARlRJIhFsYXN0X3NlY3Rpb24GOwBGSSIKc2FsZXMGOwBUSSIQX2NzcmZfdG9rZW4GOwBGSSIxbTM5eGRPUVhLeGYrQU1jcjZIdi81ajVFWmYzQWFSQmhxZThNcG92cWxVdz0GOwBGSSIIcGlzBjsARmkUSSIOdXNlcl9kYXRhBjsARnsQOhBzYWxlc19vcmRlckkiD3ByaWNlX2Rlc2MGOwBUOhJyZW50YWxzX29yZGVySSIPcHJpY2VfZGVzYwY7AFQ6EGluX2NvbnRyYWN0RjoNaGlkZV9tYXBGOhJzaG93X2xpc3RpbmdzRjoSbW9ydGdhZ2VfdGVybWkjOhltb3J0Z2FnZV9kb3ducGF5bWVudGkZOiFtb3J0Z2FnZV9kb3ducGF5bWVudF9kb2xsYXJzaQJQwzoSbW9ydGdhZ2VfcmF0ZWYJNC4wNToTbGlzdGluZ3Nfb3JkZXJJIhBsaXN0ZWRfZGVzYwY7AFQ6EHNlYXJjaF92aWV3SSIMZGV0YWlscwY7AFRJIhBsYXN0X3NlYXJjaAY7AEZpAXs%3D--d869dc53b8165c9f9e77233e78c568f610994ba7',
}

session = requests.Session()
response = session.get('https://streeteasy.com/for-sale/downtown', headers=headers, timeout=20)

if response.status_code == 200:
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('h3 > a')
    print(links)
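For what it's worth, requests.Session stores any Set-Cookie headers it receives, so one pattern worth trying is to hit the landing page first and let the session carry whatever cookies the server hands out, instead of hard-coding the cookie header. A minimal sketch (it may well still fail if the cookie in question is set by JavaScript or an anti-bot service rather than by a plain Set-Cookie header):

import requests
from bs4 import BeautifulSoup

session = requests.Session()
session.headers.update({
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36',
})

# First request: collect whatever cookies the server sets on a normal page view
session.get('https://streeteasy.com/sales/all', timeout=20)

# Second request: the session re-sends those cookies automatically
response = session.get('https://streeteasy.com/for-sale/downtown', timeout=20)
if response.status_code == 200:
    soup = BeautifulSoup(response.text, 'lxml')
    print(soup.select('h3 > a'))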
https://www.kralilan.com/liste/kiralik-bina
This is the website I am trying to scrape. When you open it, the listings are generated with an AJAX request, and the same request keeps populating the page as you scroll down; this is how the infinite scrolling is implemented.
I found the request that is sent to the server when I scroll down, and I tried to simulate it with the same headers and request payload. This is my spider:
class MySpider(scrapy.Spider):
    name = 'kralilanspider'
    allowed_domains = ['kralilan.com']
    start_urls = [
        'https://www.kralilan.com/liste/satilik-bina'
    ]

    def parse(self, response):
        headers = {
            'Referer': 'https://www.kralilan.com/liste/kiralik-bina',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            #'Content-Type': 'application/json; charset=utf-8',
            #'X-Requested-With': 'XMLHttpRequest',
            #'Content-Length': 246,
            #'Connection': 'keep-alive',
        }

        yield scrapy.Request(
            url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
            method='POST',
            headers=headers,
            callback=self.parse_ajax
        )

    def parse_ajax(self, response):
        yield {'data': response.text}
If I uncomment the commented-out headers, the request fails with status code 400 or 500.
I tried to send the request payload as the body in the parse method. That didn't work either.
If I try to yield response.body, I get TypeError: Object of type bytes is not JSON serializable.
What am I missing here?
The following implementation will fetch the response you would like to grab. You missed the most important part: the data (request payload) to pass as the body of your POST request.
import json
import scrapy

class MySpider(scrapy.Spider):
    name = 'kralilanspider'
    data = {
        'incomestr': '["Bina","1",-1,-1,-1,-1,-1,5]',
        'intextstr': '{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}',
        'index': 0,
        'count': '10',
        'opt': '1',
        'type': '3'
    }

    def start_requests(self):
        yield scrapy.Request(
            url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
            method='POST',
            body=json.dumps(self.data),
            headers={"content-type": "application/json"}
        )

    def parse(self, response):
        items = json.loads(response.text)['d']
        yield {"data": items}
In case you want to parse data from multiple pages (a new page index is requested each time you scroll down), the following will do the trick. The pagination is driven by the index key in the payload.
import json
import scrapy

class MySpider(scrapy.Spider):
    name = 'kralilanspider'
    data = {
        'incomestr': '["Bina","1",-1,-1,-1,-1,-1,5]',
        'intextstr': '{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}',
        'index': 0,
        'count': '10',
        'opt': '1',
        'type': '3'
    }
    headers = {"content-type": "application/json"}
    url = 'https://www.kralilan.com/services/ki_operation.asmx/getFilter'

    def start_requests(self):
        yield scrapy.Request(
            url=self.url,
            method='POST',
            body=json.dumps(self.data),
            headers=self.headers,
            meta={'index': 0}
        )

    def parse(self, response):
        items = json.loads(response.text)['d']
        res = scrapy.Selector(text=items)
        for item in res.css(".list-r-b-div"):
            title = item.css(".add-title strong::text").get()
            price = item.css(".item-price::text").get()
            yield {"title": title, "price": price}

        page = response.meta['index'] + 1
        self.data['index'] = page
        yield scrapy.Request(self.url, headers=self.headers, method='POST', body=json.dumps(self.data), meta={'index': page})
Why do you ignore the POST body? You need to submit it too:
def parse(self, response):
    headers = {
        'Referer': 'https://www.kralilan.com/liste/kiralik-bina',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json; charset=utf-8',
        'X-Requested-With': 'XMLHttpRequest',
        #'Content-Length': 246,
        #'Connection': 'keep-alive',
    }

    payload = """
    { incomestr:'["Bina","2",-1,-1,-1,-1,-1,5]', intextstr:'{"isCoordinates":false,"ListDrop":[],"ListText":[{"id":"78","Min":"","Max":""},{"id":"107","Min":"","Max":""}],"FiyatData":{"Max":"","Min":""}}', index:'0' , count:'10' , opt:'1' , type:'3'}
    """

    yield scrapy.Request(
        url='https://www.kralilan.com/services/ki_operation.asmx/getFilter',
        method='POST',
        body=payload,
        headers=headers,
        callback=self.parse_ajax
    )
For fun, I'm trying to use Python requests to log on to my school's student portal. This is what I've come up with so far. I'm trying to be very explicit with the headers, because I'm getting a 200 status code (the code you also get when a login fails) instead of a 302 (successful login).
import sys
import os
import requests

def login(username, password):
    url = '(link)/home.html#sign-in-content'
    values = {
        'translator_username': '',
        'translator_password': '',
        'translator_ldappassword': '',
        'returnUrl': '',
        'serviceName': 'PS Parent Portal',
        'serviceTicket': '',
        'pcasServerUrl': '\/',
        'credentialType': 'User Id and Password Credential',
        'account': username,
        'pw': password,
        'translatorpw': password
    }

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9',
        'cache-control': 'max-age=0',
        'connection': 'keep-alive',
        'content-type': 'application/x-www-form-urlencoded',
        'host': '(link)',
        'origin': '(link)',
        'referer': '(link)guardian/home.html',
        'upgrade-insecure-requests': '1'
    }

    with requests.Session() as s:
        p = s.post(url, data=values)
        if p.status_code == 302:
            print(p.text)
        print('Authentication error', p.status_code)

        r = s.get('(link)guardian/home.html')
        print(r.text)

def main():
    login('myname', 'mypass')

if __name__ == '__main__':
    main()
Using Chrome to examine the network requests, all of these headers appear under 'Request Headers', along with a long cookie value, content-length, and user-agent.
The form data is as follows:
pstoken:(token)
contextData:(text)
translator_username:
translator_password:
translator_ldappassword:
returnUrl:(url)guardian/home.html
serviceName:PS Parent Portal
serviceTicket:
pcasServerUrl:\/
credentialType:User Id and Password Credential
account:f
pw:(id)
translatorpw:
Am I missing something with the headers or form names? Is it a problem with cookies?
If I look at p.request.headers, this is what is sent:
{'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36', 'accept-encoding': 'gzip, deflate, br', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8', 'connection': 'keep-alive', 'accept-language': 'en-US,en;q=0.9', 'cache-control': 'max-age=0', 'content-type': 'application/x-www-form-urlencoded', 'host': '(url)', 'origin': '(url)', 'referer': '(url)guardian/home.html', 'upgrade-insecure-requests': '1', 'Content-Length': '263'}
p.text gives me the HTML of the login page.
Tested with PowerAPI, requests, Mechanize, and RoboBrowser. All fail.
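One thing that stands out when comparing the form listing above with the values dict: the browser posts pstoken and contextData, but the script never sends them. Also, requests follows redirects by default, so a successful 302 would be reported as the final 200. A hedged sketch of fetching the hidden fields first and keeping the 302 visible (the '(link)' placeholder is kept from the question, and the hidden-input parsing is an assumption about the page markup, not documented portal behaviour):

import requests
from bs4 import BeautifulSoup

def login(username, password):
    base = '(link)'                      # placeholder kept from the question
    login_url = base + '/home.html'

    with requests.Session() as s:
        # Load the login page first: the session keeps its cookies, and the
        # hidden inputs (pstoken, contextData, ...) can be copied into the form
        page = s.get(login_url)
        soup = BeautifulSoup(page.text, 'html.parser')
        values = {
            tag['name']: tag.get('value', '')
            for tag in soup.select('input[type=hidden]')
            if tag.get('name')
        }
        values.update({
            'account': username,
            'pw': password,
            'translatorpw': password,
        })

        # allow_redirects=False keeps a 302 visible; by default requests follows
        # the redirect and reports the status of the final page instead
        p = s.post(login_url, data=values, allow_redirects=False)
        print(p.status_code, p.headers.get('Location'))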
What response do you expect? You are analyzing your response the wrong way.
with requests.Session() as s:
    p = s.post(url, data=values)
    if p.status_code == 302:
        print(p.text)
    print('Authentication error', p.status_code)

    r = s.get('(link)guardian/home.html')
    print(r.text)
In your code, you print 'Authentication error' regardless of the status_code. I think it should at least look like this:
with requests.Session() as s:
    p = s.post(url, data=values)
    if p.status_code == 302:
        print(p.text)
        r = s.get('(link)guardian/home.html')
        print(r.text)
    else:
        print('Authentication error', p.status_code)