I'm trying to scrape this site https://acsd.crimegraphics.com, but I am getting only one record. Also, I don't know how to do this in a loop. I'm not able to scrape specific data. Here is the code that I have come up with so far.
import re
import requests
headers = {}
headers["Accept"] = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3"
headers["Accept-Encoding"] = "gzip, deflate, br"
headers["Accept-Language"] = "en-GB,en-US;q=0.9,en;q=0.8"
headers["Cache-Control"] = "max-age=0"
headers["Connection"] = "keep-alive"
headers["Content-Length"] = "31688"
headers["Content-Type"] = "application/x-www-form-urlencoded"
headers["Cookie"] = "ASP.NET_SessionId=4z2ypzsqxnug03cf5ztitpfx; ExistingVisitor=ExistingVisitor=Y; __utma=114720600.1520129764.1565270251.1565270251.1565270251.1; __utmc=114720600; __utmz=114720600.1565270251.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1; __utmb=114720600.1.10.1565270251"
headers["Host"] = "acsd.crimegraphics.com"
headers["Origin"] = "https://acsd.crimegraphics.com"
headers["Referer"] = "https://acsd.crimegraphics.com/2013/default.aspx"
headers["Upgrade-Insecure-Requests"] = "1"
headers["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
payload = {}
payload["__EVENTTARGET"] = "MainMenu$InmatesMenu"
payload["__EVENTARGUMENT"] = "InmatesMenu"
url = "https://acsd.crimegraphics.com/2013/default.aspx"
r = requests.post(url, headers=headers, data=payload)
#print(r.text)
headers1 = {}
headers1["Accept"] = "*/*"
headers1["Accept-Encoding"] = "gzip, deflate, br"
headers1["Accept-Language"] = "en-GB,en-US;q=0.9,en;q=0.8"
headers1["Cache-Control"] = "no-cache"
headers1["Connection"] = "keep-alive"
headers1["Content-Length"] = "58838"
headers1["Content-Type"] = "application/x-www-form-urlencoded; charset=UTF-8"
headers1["Cookie"] = "ASP.NET_SessionId=4z2ypzsqxnug03cf5ztitpfx; __utma=114720600.1520129764.1565270251.1565270251.1565270251.1; __utmc=114720600; __utmz=114720600.1565270251.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmb=114720600.2.10.1565270251"
headers1["Host"] = "acsd.crimegraphics.com"
headers1["Origin"] = "https://acsd.crimegraphics.com"
headers1["Referer"] = "https://acsd.crimegraphics.com/2013/default.aspx"
headers1["User-Agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
payload1 = {}
payload1["ScriptManager1"] = "UpdatePanel1|gvInmates_ob_gvInmatesPE"
payload1["__EVENTTARGET"] = "gvInmates_ob_gvInmatesPE 77886"
payload1["__EVENTARGUMENT"] = ""
payload1["__LASTFOCUS"] = ""
payload1["__VIEWSTATE"] = ""
payload1["__VIEWSTATEGENERATOR"] = "38817EFB"
payload1["__VIEWSTATEENCRYPTED"] = ""
payload1["gvInmates$ob_gvInmatesSelectedRecordsContainer"] = "BOOKNO*_o_eg_*77882*_o_sep_*NAME*_o_eg_*GANNON%2C%20COURTNEY%20BETH*_o_sep_*BOOKDATE*_o_eg_*08%2F07%2F2019%2012%3A40*_o_sep_*ARSTAGENCY*_o_eg_*ACSO*_o_sep_*ARSTLOC*_o_eg_*SAN%20JOAQUIN%20CO*_o_sep_*STATUS*_o_eg_*In%20Custody*_o_osep_*true"
r2 = requests.post(url, headers=headers1, data=payload1)
print(r2.text)
Related
from twocaptcha import TwoCaptcha
import json
import requests
import random
def rand(list):
return random.randrange(0, len(list))
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'
]
headers = {
'user-agent': user_agents[rand(user_agents)],
"accept": "*/*",
"authority": "discord.com",
"method": "POST",
"path": "/api/v9/auth/register",
"scheme": "https",
"origin": "discord.com",
"referer": "discord.com/register",
"x-debug-options": "bugReporterEnabled",
"accept-language": "en-US,en;q=0.9",
"connection": "keep-alive",
"content-Type": "application/json",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiRGlzY29yZCBDbGllbnQiLCJyZWxlYXNlX2NoYW5uZWwiOiJzdGFibGUiLCJjbGllbnRfdmVyc2lvbiI6IjEuMC45MDAzIiwib3NfdmVyc2lvbiI6IjEwLjAuMjIwMDAiLCJvc19hcmNoIjoieDY0Iiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiY2xpZW50X2J1aWxkX251bWJlciI6MTA0OTY3LCJjbGllbnRfZXZlbnRfc291cmNlIjpudWxsfQ==",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin"
}
solver = TwoCaptcha('redacted')
try:
result = solver.hcaptcha(
sitekey='4c672d35-0701-42b2-88c3-78380b0db560',
url='https://discord.com/register',
)
except Exception as e:
sys.exit(e)
else:
print(result['code'])
def getfingerprint():
request_url = "https://discord.com/api/v9/experiments?with_guild_experiments=true"
r = requests.get(request_url , headers = {
"user-agent": user_agents[rand(user_agents)],
"x-context-properties": "eyJsb2NhdGlvbiI6Ii9jaGFubmVscy9AbWUifQ==",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiQ2hyb21lIiwiZGV2aWNlIjoiIiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiYnJvd3Nlcl91c2VyX2FnZW50IjoiTW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEwNS4wLjAuMCBTYWZhcmkvNTM3LjM2IiwiYnJvd3Nlcl92ZXJzaW9uIjoiMTA1LjAuMC4wIiwib3NfdmVyc2lvbiI6IjEwIiwicmVmZXJyZXIiOiIiLCJyZWZlcnJpbmdfZG9tYWluIjoiIiwicmVmZXJyZXJfY3VycmVudCI6IiIsInJlZmVycmluZ19kb21haW5fY3VycmVudCI6IiIsInJlbGVhc2VfY2hhbm5lbCI6InN0YWJsZSIsImNsaWVudF9idWlsZF9udW1iZXIiOjE0NTQyOSwiY2xpZW50X2V2ZW50X3NvdXJjZSI6bnVsbH0="
})
if r.status_code == 200:
fingerprint = json.loads(r.text)["fingerprint"]
print(fingerprint)
else:
return r.text
payload = {
'captcha_key': result['code'],
'consent': True,
'date_of_birth': '2001-11-19',
'email': 'enter email',
'fingerprint': getfingerprint(),
'gift_code_sku_id': None,
'invite': None,
'password': 'enter username',
'username': 'enter password'
}
def register():
data = payload
res = requests.post('https://discord.com/api/v9/auth/register', headers = headers, json = data)
print('Token: ' + res.text)
if __name__ == '__main__':
register()
Code now works and generates accounts
just have to enter your own emails, username and password
can tell when it works cuz it outputs a token
if you get an error mentioning invalid hcaptcha response change email and use a vpn and try again
if you do decide to loop this have it loop slowly or setup proxies
I am learning scrapy and am trying to scrape this realtor site in Quebec. I am using their API to collect homes and print the URLs to the screen. But my last function print_urls() won't run. I really am stuck here i tried debugging it and it just skips right over my whole function block.
class CentrishomesSpider(scrapy.Spider):
name = 'centrisHomes'
# allowed_domains = ['www.centris.ca']
# start_urls = ['http://www.centris.ca/']
def start_requests(self):
query = {...
}
yield scrapy.Request(
url='https://www.centris.ca/property/UpdateQuery',
method='POST',
body=json.dumps(query),
headers={
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
},
callback=self.get_inscriptions
)
...
def get_inscriptions(self, response):
resp, success = self.success(response)
if success == True:
print(Fore.GREEN + 'Query Updated' + Style.RESET_ALL)
else:
print(Fore.RED + 'Query Not Updated' + Style.RESET_ALL)
yield scrapy.Request(
url='https://www.centris.ca/Property/GetInscriptions',
method='POST',
body=json.dumps({"startPosition": 0}),
headers={
'Content-Type': 'application/json',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
},
callback=self.handle_inscriptions
)
def handle_inscriptions(self, response):
homes, success = self.success(response)
if success == True:
print(Fore.GREEN + 'Count ' + str(homes['d']['Result']['count']) + Style.RESET_ALL)
# self.test()
self.html = Selector(text=homes['d']['Result']['html'])
self.print_urls()
# print(response.body)
...
def success(self, response):
my_dict = literal_eval(response.body.decode(
'utf-8').replace(':true}', ':True}'))
if my_dict['d']['Succeeded'] == True:
return my_dict, True
else:
return False
def print_urls(self):
print('try')
# page_html = Selector(resp['d']['Result']['html'])
page_html = self.html
homes = page_html.xpath('//div[contains(#class, "property-thumbnail-item")]')
for home in homes:
yield{
'home_url':home.xpath('.//a[#class="property-thumbnail-summary-link"]/#href').get()
}
...
I figured out my own problem, it was because I turned my print_urls function into a generator, and calling self.print_urls() doesn't make my generator do anything. S/o #AbdealiJK I figured it out because of his answer.
https://stackoverflow.com/a/34609397/19966841
I try many method but they will not solve they show me the error:
All arrays must be of the same length
from bs4 import BeautifulSoup
import requests
import pandas as pd
review = []
ratings = []
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Accept-Encoding": "gzip, deflate",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"DNT": "1",
"Connection": "close",
"Upgrade-Insecure-Requests": "1",
}
for page in range(1, 5):
r = requests.get(
"https://www.amazon.com/s?k=redmi&page=2&qid=1631528810&ref=sr_pg_={page}".format(
page=page
),
headers=headers,
)
soup = BeautifulSoup(r.content, "lxml")
for d in soup.findAll("div", attrs={"class": "s-result-item"}):
rating = d.find("span", attrs={"class": "a-icon-alt"})
if rating is not None:
ratings.append(rating.text)
reviews = d.find("span", class_="a-size-base")
if reviews is not None:
review.append(reviews.text)
df = pd.DataFrame({"rating": ratings, "reviews": review})
df.to_csv("products .csv", index=False, encoding="utf-8")
ratings and reviews aren't the same length and you were scraping wrong containers. I made the necessary modifications, now it should work:
from bs4 import BeautifulSoup
import requests
import pandas as pd
review = []
ratings = []
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
"Accept-Encoding": "gzip, deflate",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"DNT": "1",
"Connection": "close",
"Upgrade-Insecure-Requests": "1",
}
for page in range(1, 5):
cookies = {'session': '17ab96bd8ffbe8ca58a78657a918558'}
r = requests.get(
"https://www.amazon.com/s?k=redmi&page=2&qid=1631528810&ref=sr_pg_={page}".format(
page=page
),
headers=headers,
cookies =cookies
)
soup = BeautifulSoup(r.content, "lxml")
for d in soup.select(".s-result-item[data-component-type='s-search-result']"):
rating = d.find("span", attrs={"class": "a-icon-alt"})
if rating is not None:
ratings.append(rating.text)
else:
ratings.append("-")
reviews = d.find("span", class_="a-size-base")
if reviews is not None and rating is not None:
review.append(reviews.text)
else:
review.append("-")
df = pd.DataFrame({"rating": ratings, "reviews": review})
df.to_csv("products.csv", index=False, encoding="utf-8")
i'm sending below request to URL and get the response from it
import requests
url = "http://localhost/dat.txt"
payload = {}
headers = {
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
'Sec-Fetch-Dest': 'document',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'
}
response = requests.request("GET", url, headers=headers, data = payload)
print(response.text.encode('utf8'))
Below is the response data that I get -
mohame4|nameon#example.com|passsd!##$4|head,customer|manager,devlop
mohame3|nameon3#example.com|passsd!##$4|head,customer|manager,devlop
I do this with the data
for i in response.text:
try:
i = i.strip().split('|')
userna = i[0]
emaill = i[1]
passd = i[2]
rol1= i[3]
rol2= i[4]
except:
pass
How can I make rol1 as
this head,customer
to
rol1=['head','customer']
Simply split the string you're getting:
rol1 = i[3].split(',')
You could do this more... gracefully, though, using iterable unpacking:
username, email, password, rol1, rol2 = i.strip().split('|')
rol1 = rol1.split(',')
thanks for all helper special #ForceBru
import requests
url = "http://localhost/dat.txt"
response = requests.request("GET", url)
print(response.text)
dat = str(response.text).split('\n')
for i in dat:
i = i.strip().split('|')
print(i[3].split(","))
# TODO: write code...
I'm trying to scrape WizzAir for personal use. Can't understand what's wrong with my code. Could it be incorrect payload object or cookies?
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate, sdch, br",
"Accept-Language": "en-US,en;q=0.8,lt;q=0.6,ru;q=0.4",
"Origin": "https://wizzair.com",
"Referer": "https://wizzair.com/"
}
search_url = "https://wizzair.com/lt-LT/FlightSearch"
session = requests.Session()
r = session.get("https://be.wizzair.com/3.8.2/Api/asset/yellowRibbon", headers=headers, allow_redirects=False)
session_id = r.cookies["ASP.NET_SessionId"]
cookies = {
"ASP.NET_SessionId": session_id,
"HomePageSelector": "FlightSearch",
}
# wizz_url = "https://be.wizzair.com/3.8.2/Api/search/search"
wizz_url = "https://be.wizzair.com/3.8.2/Api/asset/farechart"
payload = {"flightList":[{"departureStation":"VNO","arrivalStation":"FCO","departureDate":"2017-02-20"}],"adultCount":1,"childCount":0,"infantCount":0,"wdc":True, "dayInterval":3}
r = session.post(url=wizz_url,data=payload,headers=headers, cookies=cookies)
print r.content
>>> {"validationCodes":["FlightCount_MustBe_OneOrTwo"]}
I run this - even without session and cookies - and get some data.
You have to send it as JSON - using json=payload
import requests
payload = {
"flightList":[
{
"departureStation": "VNO",
"arrivalStation": "FCO",
"departureDate": "2017-02-20"
}
],
"adultCount": 1,
"childCount": 0,
"infantCount": 0,
"wdc": True,
"dayInterval": 3
}
url = 'https://be.wizzair.com/3.8.2/Api/search/search'
r = requests.post(url, json=payload)
print(r.text)
data = r.json()
print(data['outboundFlights'][0]['flightNumber'])
If you will have to use cookies and headers then use Session and then you don't have to copy cookies and headers from one request to another.
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
#"Accept": "application/json, text/plain, */*",
#"Accept-Encoding": "gzip, deflate, sdch, br",
#"Accept-Language": "en-US,en;q=0.8,lt;q=0.6,ru;q=0.4",
}
s = requests.Session()
s.headers.update(headers)
# to get cookies
r = s.get("https://www.wizzair.com/")
payload = {
"flightList":[
{
"departureStation": "VNO",
"arrivalStation": "FCO",
"departureDate": "2017-02-20"
}
],
"adultCount": 1,
"childCount": 0,
"infantCount": 0,
"wdc": True,
"dayInterval": 3
}
url = 'https://be.wizzair.com/3.8.2/Api/search/search'
r = s.post(url, json=payload)
print(r.text)
data = r.json()
print(data['outboundFlights'][0]['flightNumber'])