I am trying to parse the JSON string from this page, but I can't manage to convert it into a Python dict via json.loads. Here is my starter code:
import requests
import re
import json
import html
headers = {
"authority": "www.budgetpetproducts.com.au",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-language": "en,ru;q=0.9",
"cache-control": "max-age=0",
# 'cookie': '_ALGOLIA=anonymous-6d2b7032-518b-4da0-b0b9-964e00920f3d; scarab.visitor=%2245DBAC5A87F9C262%22; _fbp=fb.2.1674782859328.577366252; _tt_enable_cookie=1; _ttp=jTkn1Rm860eDgPiXlG45pfHRDG7; _gid=GA1.3.415748101.1675529773; _ga=GA1.3.583864494.1674782855; _uetsid=d505d7e0a4ac11ed962a7de239a71ea1; _uetvid=c75d7e109de111eda4efb3023338d7ad; XSRF-TOKEN=eyJpdiI6IkJjNVpiMzNQRGFRYzJ6Z1ZUR2NEVFE9PSIsInZhbHVlIjoiSGNyN2J4dUtza1JadUhqUFAwWklWcFdIVmZLSC84OEp4TUtRYWdLQzBhUU1GK2psMzFHQW5SVTlZZm1Yd0xmaGt3QWFoRDVsNVYyRGdKYVRKbUZSak9UMzlCZEhXc0FubjdORERraE5nRHNsYWViVkxZZCt6d2VkbGNGNjhNWlciLCJtYWMiOiIyZjQxMzk5YjZjZWNmM2E1MmVjYmQxODAxNDY3ZWY1MTZiM2MyNzcwODBmY2ZlNWM5YTVmMDU4MWMwMDViZjQ5In0%3D; budget_pet_products_session=eyJpdiI6IkdnVHVrTjlkUGY1SGxMN0lZWTVsckE9PSIsInZhbHVlIjoibERhNnllekN5azRhMEptSU9QeGZ3VkVaYUpsUkxGbi9rR21yMXArWEoycWdlMWJSZFRnL3BtOUhBMXBXQ0syVEJPbUtaOVVpLzRkdFBwRDZsUU45V3lxd1JzK2lTU1RyLzkyM24xcTR2TUUvQVdrdEV0VHpoRDFIVFNBZHJTVTgiLCJtYWMiOiIwNDAzMTM3YmQ5YWE3ZWY3NmE3MjA3OGEyMTZmMWM5ODY3ZjlkOGZjNDEyYzkzNmM5MzhjY2ZkODcyNmU3N2NjIn0%3D; scarab.profile=%221677%7C1675608432%7C8787%7C1675608372%7C3624%7C1675146623%22; _ga_6YGE1ZKCTV=GS1.1.1675608368.2.1.1675608533.0.0.0',
"referer": "https://www.budgetpetproducts.com.au/dog/food?sort=best_match",
"sec-ch-ua": '"Not?A_Brand";v="8", "Chromium";v="108", "Yandex";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.1.1038 (beta) Yowser/2.5 Safari/537.36",
}
response = requests.get(
"https://www.budgetpetproducts.com.au/product/royal-canin-maxi-adult-dry-dog-food-4kg/1679",
headers=headers,
)
data = (
re.search(":data=(.*)", response.text)
.group(1)
.replace(':is-mobile="false"></product-page-component>', "")
.replace("&quot;", '"')
.replace("'", '\\"')
)
clean_data = html.unescape(data)
json_blob = json.loads(clean_data)
print(json_blob)
The above code gives a JSONDecodeError:
File "/usr/lib/python3.9/json/decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 4 (char 3)
I checked whether the JSON is valid in a JSON formatter, and it is valid: https://jsoneditoronline.org/#left=cloud.c7e2f35696094a07a49afba2d18c6ad4
As advised in the comment by Abolfazi Ghaemi, change the appropriate line to:
json_blob = json.loads(clean_data[1:-2])
in order to get this printed:
{'info': {'id': 1679, 'name': {'title': 'Royal Canin Maxi Adult Dry Dog Food 4kg', 'text': 'Royal Canin Maxi Adult Dry Dog Food 4kg', 'icon': None, 'slug': 'royal-canin-maxi-adult-dry-dog-food-4kg'}, 'category': {'html': ['Dog', ...
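As an aside, the fixed [1:-2] slice works here but is fragile. A more defensive sketch (assuming, as the replacements above suggest, that the :data attribute value is wrapped in double quotes and the embedded JSON is HTML-entity-encoded) captures only the attribute value and unescapes it before parsing:
import html
import json
import re

# Capture just the quoted :data attribute value (it contains no raw '"' while
# the embedded JSON is still entity-encoded), then unescape and parse it.
match = re.search(r':data="([^"]*)"', response.text)
if match:
    clean_data = html.unescape(match.group(1))
    json_blob = json.loads(clean_data)
    print(json_blob["info"]["id"])  # 1679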
I need two sets of data from this website:
https://www.nasdaq.com/market-activity/stocks/aapl/institutional-holdings
These include both the "Active Positions" and "New and Sold Out Positions" tables. The code I have can only write one of the tables to a JSON file:
import requests
import pandas as pd
url = 'https://api.nasdaq.com/api/company/AAPL/institutional-holdings?limit=15&type=TOTAL&sortColumn=marketValue&sortOrder=DESC'
headers = {
'accept': 'application/json, text/plain, */*',
'origin': 'https://www.nasdaq.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
r = requests.get(url, headers=headers)
df = pd.json_normalize(r.json()['data']['newSoldOutPositions']['rows'])
df.to_json('AAPL_institutional_positions.json')
This gives the following output (JSON):
{
"positions":{
"0":"New Positions",
"1":"Sold Out Positions"
},
"holders":{
"0":"99",
"1":"90"
},
"shares":{
"0":"37,374,118",
"1":"4,637,465"
}
}
Whereas for the other table I am scraping, I use this code (all I have done is change "newSoldOutPositions" to "activePositions"):
import requests
import pandas as pd
url = 'https://api.nasdaq.com/api/company/AAPL/institutional-holdings?limit=15&type=TOTAL&sortColumn=marketValue&sortOrder=DESC'
headers = {
'accept': 'application/json, text/plain, */*',
'origin': 'https://www.nasdaq.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
r = requests.get(url, headers=headers)
df = pd.json_normalize(r.json()['data']['activePositions']['rows'])
df.to_json('AAPL_institutional_positions.json')
Which gives this output (JSON):
{
"positions":{
"0":"Increased Positions",
"1":"Decreased Positions",
"2":"Held Positions",
"3":"Total Institutional Shares"
},
"holders":{
"0":"1,780",
"1":"2,339",
"2":"283",
"3":"4,402"
},
"shares":{
"0":"239,170,203",
"1":"209,017,331",
"2":"8,965,339,255",
"3":"9,413,526,789"
}
}
So my question is: how can I combine the scraping to grab both sets of data and output them all in one JSON file?
Thanks
If you only want json data, there is no need to use pandas:
import requests
nasdaq_dict = {}
url = 'https://api.nasdaq.com/api/company/AAPL/institutional-holdings?limit=15&type=TOTAL&sortColumn=marketValue&sortOrder=DESC'
headers = {
'accept': 'application/json, text/plain, */*',
'origin': 'https://www.nasdaq.com',
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.79 Safari/537.36'
}
r = requests.get(url, headers=headers)
nasdaq_dict['activePositions'] = r.json()['data']['activePositions']['rows']
nasdaq_dict['newSoldOutPositions'] = r.json()['data']['newSoldOutPositions']['rows']
print(nasdaq_dict)
Result in terminal:
{'activePositions': [{'positions': 'Increased Positions', 'holders': '1,795', 'shares': '200,069,709'}, {'positions': 'Decreased Positions', 'holders': '2,314', 'shares': '228,105,026'}, {'positions': 'Held Positions', 'holders': '308', 'shares': '8,976,744,094'}, {'positions': 'Total Institutional Shares', 'holders': '4,417', 'shares': '9,404,918,829'}], 'newSoldOutPositions': [{'positions': 'New Positions', 'holders': '121', 'shares': '55,857,143'}, {'positions': 'Sold Out Positions', 'holders': '73', 'shares': '8,851,038'}]}
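If you also want the combined data written to a single file, as in your original code, json.dump on the dict does it; a minimal sketch reusing nasdaq_dict and the filename from the question:
import json

with open('AAPL_institutional_positions.json', 'w') as f:
    json.dump(nasdaq_dict, f, indent=2)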
I need the value of the variable defined as NUEMRODEDNI to be substituted into the request data; how is that done?
Excuse me, I'm a newbie.
I don't know how to use a variable defined in Python; I need it to be replaced as shown in the image.
import string
import requests
from requests.structures import CaseInsensitiveDict
url = "www.url.com:8022/SistemaIdentificacion/servlet/com.personas.consultapersona?030bf8cfcd4bfccbd543df61b1b43f67,gx-no-cache=1648440596691"
NUEMRODEDNI = "41087712"
headers = CaseInsensitiveDict()
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"
headers["Accept"] = "*/*"
headers["Accept-Language"] = "es-AR,es;q=0.8,en-US;q=0.5,en;q=0.3"
headers["Accept-Encoding"] = "gzip, deflate"
headers["GxAjaxRequest"] = "1"
headers["Content-Type"] = "application/json"
headers["AJAX_SECURITY_TOKEN"] = "a6da9873adb..."
headers["X-GXAUTH-TOKEN"] = "eyJ0eXAiOiJ..."
headers["Origin"] = "http://www.url.com"
headers["Connection"] = "keep-alive"
headers["Referer"] = "www.url.com/SistemaIdentificacion/servlet/com.personas.consultapersona"
headers["Cookie"] = "GX_CLIENT_ID=0496f100-9e4e-4e36-a68d-ba3770ee2bff; GX_SESSION_ID=KUqyHU%2FZbpu96sYlj7Gry8bCYpV6CaSgVk0BLxVCpAU%3D; JSESSIONID=1812E6AC00940BDB325EF9592CB93FF8; GxTZOffset=America/Argentina/Buenos_Aires"
data = '{"MPage":false,"cmpCtx":"","parms":[EDIT HERE,{"s":"M","v":[["","(None)"],["F","Femenino"],["M","Masculino"]]},"M",{"User":"","CompanyCode":0,"Profile":"","UsrOneLoginID":"6647","Depid":1,"UsrLP":6488,"unidad":"","unidadid":"68","IMFParteCuerpo":"","denunciasid":0,"destino":"68","TipoPersona":"","NombreArchivo":"","denorigen":"","macdestinoscodorganigrama":""}],"hsh":[],"objClass":"consultapersona","pkgName":"com.personas","events":["ENTER"],"grids":{}}'
resp = requests.post(url, headers=headers, data=data)
print(resp.content)
Please provide your code typed out rather than a screenshot of it, so that we can simply copy & run it on our end.
Nonetheless, you should try:
NUMERODNI = "41087712"
data = '{"MPage":false,"cmpCtx":"","parms":[EDIT HERE,{"s":"M","v":[["","(None)"],["F","Femenino"],["M","Masculino"]]},"M",{"User":"","CompanyCode":0,"Profile":"","UsrOneLoginID":"6647","Depid":1,"UsrLP":6488,"unidad":"","unidadid":"68","IMFParteCuerpo":"","denunciasid":0,"destino":"68","TipoPersona":"","NombreArchivo":"","denorigen":"","macdestinoscodorganigrama":""}],"hsh":[],"objClass":"consultapersona","pkgName":"com.personas","events":["ENTER"],"grids":{}}'
data = data.replace("EDIT HERE", NUMERODNI)
print(data) # '{... "parms":[41087712, ...}'
This solution definitely delivers the desired string as a result.
If your code still does not do what you would like it to, then the actual issue has to be somewhere else.
Since you're POSTing JSON data, it's easier to just keep your data as a Python dict and tell Requests to JSON-ify it (requests.post(json=...)).
That way you don't need string substitution, just use the variable.
I also took the opportunity to make your headers construction shorter – it can just be a dict.
import requests
url = "www.url.com:8022/SistemaIdentificacion/servlet/com.personas.consultapersona?030bf8cfcd4bfccbd543df61b1b43f67,gx-no-cache=1648440596691"
NUEMRODEDNI = "41087712"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
"Accept": "*/*",
"Accept-Language": "es-AR,es;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"GxAjaxRequest": "1",
"Content-Type": "application/json",
"AJAX_SECURITY_TOKEN": "a6da9873adb...",
"X-GXAUTH-TOKEN": "eyJ0eXAiOiJ...",
"Origin": "http://www.url.com",
"Connection": "keep-alive",
"Referer": "www.url.com/SistemaIdentificacion/servlet/com.personas.consultapersona",
"Cookie": "GX_CLIENT_ID=0496f100-9e4e-4e36-a68d-ba3770ee2bff; GX_SESSION_ID=KUqyHU%2FZbpu96sYlj7Gry8bCYpV6CaSgVk0BLxVCpAU%3D; JSESSIONID=1812E6AC00940BDB325EF9592CB93FF8; GxTZOffset=America/Argentina/Buenos_Aires",
}
data = {
"MPage": False,
"cmpCtx": "",
"parms": [
NUEMRODEDNI,
{"s": "M", "v": [["", "(None)"], ["F", "Femenino"], ["M", "Masculino"]]},
"M",
{
"User": "",
"CompanyCode": 0,
"Profile": "",
"UsrOneLoginID": "6647",
"Depid": 1,
"UsrLP": 6488,
"unidad": "",
"unidadid": "68",
"IMFParteCuerpo": "",
"denunciasid": 0,
"destino": "68",
"TipoPersona": "",
"NombreArchivo": "",
"denorigen": "",
"macdestinoscodorganigrama": "",
},
],
"hsh": [],
"objClass": "consultapersona",
"pkgName": "com.personas",
"events": ["ENTER"],
"grids": {},
}
resp = requests.post(url, headers=headers, json=data)
resp.raise_for_status()
print(resp.content)
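For what it's worth, passing json= makes Requests serialize the dict and set the Content-Type header on its own, so the explicit "Content-Type" entry in the headers above is technically redundant. A small standalone illustration (placeholder URL, not the real endpoint):
import requests

# Requests builds the body and the Content-Type header from the json= argument.
prepared = requests.Request("POST", "https://example.com", json={"a": 1}).prepare()
print(prepared.headers["Content-Type"])  # application/json
print(prepared.body)  # the serialized JSON body: {"a": 1}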
I am trying to convert my API crawler into a Scrapy API crawler, but I'm getting a 403 error in Scrapy only.
2022-01-29 11:48:14 [scrapy.core.engine] DEBUG: Crawled (403) <POST https://api2.realtor.ca/Listing.svc/PropertySearch_Post/> (referer: https://www.realtor.ca/)
What am I doing wrong on the Scrapy side? I used Insomnia to generate the working POST request, but Scrapy can't seem to handle it, and I don't know why.
Working code:
import http.client
import json
conn = http.client.HTTPSConnection("api2.realtor.ca")
payload = "ZoomLevel=11&LatitudeMax=43.98268&LongitudeMax=-78.96028&LatitudeMin=43.43223&LongitudeMin=-79.79249&Sort=6-D&PropertySearchTypeId=1&TransactionTypeId=2&PropertyTypeGroupID=1&Currency=CAD&CurrentPage=1&ApplicationId=1&CultureId=1&Version=7.0&RecordsPerPage=200"
headers = {
'cookie': "gig_bootstrap_3_mrQiIl6ov44s2X3j6NGWVZ9SDDtplqV7WgdcyEpGYnYxl7ygDWPQHqQqtpSiUfko=gigya-pr_ver4; visid_incap_2271082=JLYUNoQeQ8aK7XCukUKVyLTR4GEAAAAAQUIPAAAAAAD5E9h/kC3MKXYFk8j751oC; reese84=3:kSScWfkGFsGrIfH0oClFcA==:7I6p2X8egFPYsN4vhOQSzcG+bY31pkxS/fXEEvP9k7YrM9pKJIRxaMywKm1+U1DhuZilDxW9dT97UvcEqz9H8d6b81yJvbuEOsNCc1hlGdVd3ZjT2uSjne23lT3z3KZ2lj0sxMQRKgafW63RLwfBQWtCgxWHMp9k338xzLwnTwqzOMI+qkBLPMwUolKoJP3IL6gzbljA6PY7Pyaa/R7VMRlGzcnk+M/ILnqZV8kJ+xV8sukeGZAcs4nJqJbvRr6vSE+fsAe/Jct5iAOnYTGoC+IyUdOAIWn2jEXTjwdjndvGdsKGvvkvcnkIjX1/JNTFH922HZPL3q9LzFCEgMhuYgygFRFjCqaz/HJ0zr0DXL9uTjiripD29T9lA0k0fGs+vyMz0ApV2Ni9UJsosr747CQMLROcZCQSIik2p+Taflc3cIAZgituGSiJss6o94xm:40hT11A7LacDXW+pa+D9sBQF5meDdTDqnBCeCZs/jls=; visid_incap_2269415=SnrfkV/xRM2/zMAwUwhS+8xj4GEAAAAAQkIPAAAAAACALOWhAWPmyCDb18oF6pHY12yEiWlUbIgr; nlbi_2269415=Vq3lX/59Tg3xXSVkkG5lugAAAAB+8mCpAlS1F5Timw4irGua; _gid=GA1.2.1583316853.1643415717; ASP.NET_SessionId=ov1iizcd3npobsilupzii5ep; nlbi_2271082=uYrYPyycJwjJZMEUcbDG1QAAAABYkXBfZhGO2s/HVfvOZarc; incap_ses_303_2269415=tkGkDvUztDFYLuufDHk0BPFK9WEAAAAAaFTGABendGhY7YQ6DfHGqg==; incap_ses_303_2271082=R007Dwk7akm7gfCfDHk0BPhS9WEAAAAAEzSDycWG9SwrHnLXWGiOuQ==; reese84=3:xy6Z/sx1YkpUszEJSoRL3A==:lovsunYlSfQpJkP59Bs11wY2+LlgzOMeiL3ObotvVpDwGDJVy4RKfSLLKuvVMLLcTstWzewelf4RKJ3e6v5hAVv9wkqa01hiSd1TIDBTDdyPzcKMI0xlq6r0G2P+8dMx91eZ0jKEx40QdURnU3XwLghg1BALZ+aWt4US7pC1FIftQksLhz7QlnyBw8pl2ucIJ9JIyuM3gBjNaP4hvYyc17UOnBvf37wtLhWeFb+fomUnnLqyTag5dM/vASoIg+Uo+lH9yQI2K9xGm0KveqgF2nUre9Z+UG1gwHWRBEIjygnhZZjnJOR20wxQU7gOZ8YqW+DJdczgSiqbn93I1um+VwOlf8bD6zCq99miEtaVOdVlGZesCvoe9d9JciEryAFMlYcn3RuLvycVNPQVrdCP9REneI+J1AmfXeSveEGpLhnSZs64rniGIf7iT0lRY9c1:f519Rour27xdzG5PjxP0BlHw/5uwjBdnwdY9Zd3AWpU=; nlbi_2269415_2147483646=5DLdcUXrjg0v1GhykG5lugAAAAA0AOEmZsShmR8VQ3d3LJzx; _dc_gtm_UA-12908513-11=1; _ga=GA1.2.2104488426.1642095581; _gac_UA-12908513-11=1.1643468700.Cj0KCQiA6NOPBhCPARIsAHAy2zAmFT3_yol1CanQDHoHW_z8aJ6HgaY2f7iilRt6yGvssuzmDbbh8FoaAkpxEALw_wcB; _ga_Y07J3B53QP=GS1.1.1643467388.20.1.1643468729.19; _4c_=%7B%22_4c_s_%22%3A%22bZNbj5swEIX%2FSuTnhfgKJm%2BrVKpWatWqUp8jgwdiLcHIOKHbKP%2B9Y3JZ7ap%2BAZ85%2BoxnDmcy72EgG1ZIIQtdCiGUfiKv8DaRzZkEZ9PjRDakrUBWitqsVIxmEjjNdNGYzIAxFFdbcUmeyJ%2BFVemSKc2oKi5PxA53hoXWHPt4tymqpZaSCS3Q5sZ486WPYVJrWWquH17kX5XkvRMN%2BW89zA%2FUtaBoKcUH66KgtRlv1jNpvAVksipnMsdLthPS41%2BUMkEpvo%2FB22MTd%2FFtTMYZ6tVkX7Fg4eQa2M3Oxn0icMbf1T24bh8XWVVJHgNueE6vS1UFlYWUGkuzG6yfP3Nu6mdOHfw8QWJt98EfYPXd166HVVVi0eP8yPNgg8cRYkOghRAW8z7Gcdqs1%2FM85533XQ954w9rNE0upmsFMH30IW%2FMTcM0vMvZIn8zQ3c03dKutPVdB3b1MqSgmH4C1H4Gf3JDkyw%2FhmiC8yhu%2FXGIIeG2ZjA2oX7B5CwM0Zneh60%2FHCC4xvTLiY8K%2BmBIAx9Dus7X593vly%2Bpi4zKlBRe5JgaTiuFsSOXew6FUooJLjTDQUZk6kIuTUfH6REiK2lZK6gzJoBnsq55hjNpMuxzyy2tVQv6EW38TSoqmCjlDSmuwMvlHw%3D%3D%22%7D; _gali=lnkNextResultsPage",
'authority': "api2.realtor.ca",
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': "*/*",
'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
'sec-ch-ua-mobile': "?1",
'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36",
'sec-ch-ua-platform': '"Android"',
'origin': "https://www.realtor.ca",
'sec-fetch-site': "same-site",
'sec-fetch-mode': "cors",
'sec-fetch-dest': "empty",
'referer': "https://www.realtor.ca/",
'accept-language': "en-US,en;q=0.9"
}
conn.request("POST", "/Listing.svc/PropertySearch_Post", payload, headers)
res = conn.getresponse()
data = res.read()
out = json.loads(data.decode("utf-8"))
results = out.get('Results')
for r in results:
print(r.get('Id'), r.get('Property').get('Price'))
print(len(results))
Broken scrapy code with 403 error:
# -*- coding: utf-8 -*-
import scrapy
import json
from scrapy.exceptions import CloseSpider
class RealtorSpider(scrapy.Spider):
name = 'realtor'
allowed_domains = ['api2.realtor.ca']
api = "https://api2.realtor.ca/Listing.svc/PropertySearch_Post/"
payload = "ZoomLevel=11&LatitudeMax=43.98268&LongitudeMax=-78.96028&LatitudeMin=43.43223&LongitudeMin=-79.79249&Sort=6-D&PropertySearchTypeId=1&TransactionTypeId=2&PropertyTypeGroupID=1&Currency=CAD&CurrentPage=1&ApplicationId=1&CultureId=1&Version=7.0&RecordsPerPage=200"
headers = {
'cookie': "gig_bootstrap_3_mrQiIl6ov44s2X3j6NGWVZ9SDDtplqV7WgdcyEpGYnYxl7ygDWPQHqQqtpSiUfko=gigya-pr_ver4; visid_incap_2271082=JLYUNoQeQ8aK7XCukUKVyLTR4GEAAAAAQUIPAAAAAAD5E9h/kC3MKXYFk8j751oC; reese84=3:kSScWfkGFsGrIfH0oClFcA==:7I6p2X8egFPYsN4vhOQSzcG+bY31pkxS/fXEEvP9k7YrM9pKJIRxaMywKm1+U1DhuZilDxW9dT97UvcEqz9H8d6b81yJvbuEOsNCc1hlGdVd3ZjT2uSjne23lT3z3KZ2lj0sxMQRKgafW63RLwfBQWtCgxWHMp9k338xzLwnTwqzOMI+qkBLPMwUolKoJP3IL6gzbljA6PY7Pyaa/R7VMRlGzcnk+M/ILnqZV8kJ+xV8sukeGZAcs4nJqJbvRr6vSE+fsAe/Jct5iAOnYTGoC+IyUdOAIWn2jEXTjwdjndvGdsKGvvkvcnkIjX1/JNTFH922HZPL3q9LzFCEgMhuYgygFRFjCqaz/HJ0zr0DXL9uTjiripD29T9lA0k0fGs+vyMz0ApV2Ni9UJsosr747CQMLROcZCQSIik2p+Taflc3cIAZgituGSiJss6o94xm:40hT11A7LacDXW+pa+D9sBQF5meDdTDqnBCeCZs/jls=; visid_incap_2269415=SnrfkV/xRM2/zMAwUwhS+8xj4GEAAAAAQkIPAAAAAACALOWhAWPmyCDb18oF6pHY12yEiWlUbIgr; nlbi_2269415=Vq3lX/59Tg3xXSVkkG5lugAAAAB+8mCpAlS1F5Timw4irGua; _gid=GA1.2.1583316853.1643415717; ASP.NET_SessionId=ov1iizcd3npobsilupzii5ep; nlbi_2271082=uYrYPyycJwjJZMEUcbDG1QAAAABYkXBfZhGO2s/HVfvOZarc; incap_ses_303_2269415=tkGkDvUztDFYLuufDHk0BPFK9WEAAAAAaFTGABendGhY7YQ6DfHGqg==; incap_ses_303_2271082=R007Dwk7akm7gfCfDHk0BPhS9WEAAAAAEzSDycWG9SwrHnLXWGiOuQ==; reese84=3:xy6Z/sx1YkpUszEJSoRL3A==:lovsunYlSfQpJkP59Bs11wY2+LlgzOMeiL3ObotvVpDwGDJVy4RKfSLLKuvVMLLcTstWzewelf4RKJ3e6v5hAVv9wkqa01hiSd1TIDBTDdyPzcKMI0xlq6r0G2P+8dMx91eZ0jKEx40QdURnU3XwLghg1BALZ+aWt4US7pC1FIftQksLhz7QlnyBw8pl2ucIJ9JIyuM3gBjNaP4hvYyc17UOnBvf37wtLhWeFb+fomUnnLqyTag5dM/vASoIg+Uo+lH9yQI2K9xGm0KveqgF2nUre9Z+UG1gwHWRBEIjygnhZZjnJOR20wxQU7gOZ8YqW+DJdczgSiqbn93I1um+VwOlf8bD6zCq99miEtaVOdVlGZesCvoe9d9JciEryAFMlYcn3RuLvycVNPQVrdCP9REneI+J1AmfXeSveEGpLhnSZs64rniGIf7iT0lRY9c1:f519Rour27xdzG5PjxP0BlHw/5uwjBdnwdY9Zd3AWpU=; nlbi_2269415_2147483646=5DLdcUXrjg0v1GhykG5lugAAAAA0AOEmZsShmR8VQ3d3LJzx; _dc_gtm_UA-12908513-11=1; _ga=GA1.2.2104488426.1642095581; _gac_UA-12908513-11=1.1643468700.Cj0KCQiA6NOPBhCPARIsAHAy2zAmFT3_yol1CanQDHoHW_z8aJ6HgaY2f7iilRt6yGvssuzmDbbh8FoaAkpxEALw_wcB; _ga_Y07J3B53QP=GS1.1.1643467388.20.1.1643468729.19; _4c_=%7B%22_4c_s_%22%3A%22bZNbj5swEIX%2FSuTnhfgKJm%2BrVKpWatWqUp8jgwdiLcHIOKHbKP%2B9Y3JZ7ap%2BAZ85%2BoxnDmcy72EgG1ZIIQtdCiGUfiKv8DaRzZkEZ9PjRDakrUBWitqsVIxmEjjNdNGYzIAxFFdbcUmeyJ%2BFVemSKc2oKi5PxA53hoXWHPt4tymqpZaSCS3Q5sZ486WPYVJrWWquH17kX5XkvRMN%2BW89zA%2FUtaBoKcUH66KgtRlv1jNpvAVksipnMsdLthPS41%2BUMkEpvo%2FB22MTd%2FFtTMYZ6tVkX7Fg4eQa2M3Oxn0icMbf1T24bh8XWVVJHgNueE6vS1UFlYWUGkuzG6yfP3Nu6mdOHfw8QWJt98EfYPXd166HVVVi0eP8yPNgg8cRYkOghRAW8z7Gcdqs1%2FM85533XQ954w9rNE0upmsFMH30IW%2FMTcM0vMvZIn8zQ3c03dKutPVdB3b1MqSgmH4C1H4Gf3JDkyw%2FhmiC8yhu%2FXGIIeG2ZjA2oX7B5CwM0Zneh60%2FHCC4xvTLiY8K%2BmBIAx9Dus7X593vly%2Bpi4zKlBRe5JgaTiuFsSOXew6FUooJLjTDQUZk6kIuTUfH6REiK2lZK6gzJoBnsq55hjNpMuxzyy2tVQv6EW38TSoqmCjlDSmuwMvlHw%3D%3D%22%7D; _gali=lnkNextResultsPage",
'authority': "api2.realtor.ca",
'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
'accept': "*/*",
'content-type': "application/x-www-form-urlencoded; charset=UTF-8",
'sec-ch-ua-mobile': "?1",
'user-agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36",
'sec-ch-ua-platform': '"Android"',
'origin': "https://www.realtor.ca",
'sec-fetch-site': "same-site",
'sec-fetch-mode': "cors",
'sec-fetch-dest': "empty",
'referer': "https://www.realtor.ca/",
'accept-language': "en-US,en;q=0.9"
}
def start_requests(self):
yield scrapy.Request(url=self.api,
callback=self.parse,
method='POST',
headers=self.headers,
body=json.dumps(self.payload) )
def parse(self, response):
print('PARSE')
results = json.loads(response.body).get('Results')
print('results')
for r in results:
yield {
'Id': r.get('Id'),
'Price': r.get('Property').get('Price')
}
if __name__ == '__main__':
import os
from scrapy.cmdline import execute
os.chdir(os.path.dirname(os.path.realpath(__file__)))
SPIDER_NAME = RealtorSpider.name
try:
execute(
[
'scrapy',
'crawl',
SPIDER_NAME,
'-s',
'FEED_EXPORT_ENCODING=utf-8',
]
)
except SystemExit:
pass
You have an excess / at the end of the URL. Remove it to avoid a 404 error.
You don't need json.dumps, since the payload is already a string.
Let Scrapy handle the cookies itself.
I guess that your headers are wrong, because after changing them I no longer get the 403 status.
See how to run Scrapy from a script.
spider.py:
import scrapy
class RealtorSpider(scrapy.Spider):
name = 'realtor'
allowed_domains = ['api2.realtor.ca']
custom_settings = {
'DOWNLOAD_DELAY': 0.5,
}
api = "https://api2.realtor.ca/Listing.svc/PropertySearch_Post"
payload = "ZoomLevel=11&LatitudeMax=43.98268&LongitudeMax=-78.96028&LatitudeMin=43.43223&LongitudeMin=-79.79249&Sort=6-D&PropertySearchTypeId=1&TransactionTypeId=2&PropertyTypeGroupID=1&Currency=CAD&CurrentPage=1&ApplicationId=1&CultureId=1&Version=7.0&RecordsPerPage=200"
headers = {
"Accept": "*/*",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.5",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
"DNT": "1",
"Host": "api2.realtor.ca",
"Origin": "https://www.realtor.ca",
"Pragma": "no-cache",
"Referer": "https://www.realtor.ca/",
"Sec-Fetch-Dest": "empty",
"Sec-Fetch-Mode": "cors",
"Sec-Fetch-Site": "same-site",
"Sec-GPC": "1",
"TE": "trailers",
'User-Agent': "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Mobile Safari/537.36",
}
def start_requests(self):
yield scrapy.Request(url=self.api,
callback=self.parse,
method='POST',
headers=self.headers,
body=self.payload)
def parse(self, response):
print('PARSE')
results = response.json().get('Results')
print('results')
for r in results:
yield {
'Id': r.get('Id'),
'Price': r.get('Property').get('Price')
}
main.py:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
if __name__ == "__main__":
spider = 'realtor'
settings = get_project_settings()
process = CrawlerProcess(settings)
process.crawl(spider)
process.start()
output:
{'Id': '23983739', 'Price': '$799,000'}
{'Id': '23983759', 'Price': '$579,000'}
{'Id': '23983752', 'Price': '$750,000'}
...
...
...
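As a side note, you don't have to hand-maintain the urlencoded payload string: scrapy.FormRequest can build it from a dict and set the Content-Type header itself. A sketch of a drop-in replacement for start_requests, with the parameters copied from the payload string above (it assumes the same class attributes, i.e. self.api and self.headers):
def start_requests(self):
    formdata = {
        "ZoomLevel": "11",
        "LatitudeMax": "43.98268",
        "LongitudeMax": "-78.96028",
        "LatitudeMin": "43.43223",
        "LongitudeMin": "-79.79249",
        "Sort": "6-D",
        "PropertySearchTypeId": "1",
        "TransactionTypeId": "2",
        "PropertyTypeGroupID": "1",
        "Currency": "CAD",
        "CurrentPage": "1",
        "ApplicationId": "1",
        "CultureId": "1",
        "Version": "7.0",
        "RecordsPerPage": "200",
    }
    # FormRequest url-encodes formdata and sets the Content-Type header for you.
    yield scrapy.FormRequest(url=self.api,
                             formdata=formdata,
                             headers=self.headers,
                             callback=self.parse)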
I am currently trying to add a re (regex) step to my code for McDonald's locations across Canada. The goal is to add a column to my CSV that states whether the location is inside a Walmart. All the addresses in "address1" that are in a Walmart have a tag that says (Wal-Mart). I am hoping to separate it; if anyone can help with that, it would be great! If there is a way to do it in Excel, that would be just as good.
import csv
import json
import requests
import re
url = "https://www.mcdonalds.com/googleapps/GoogleRestaurantLocAction.do?method=searchLocation&latitude=43.6936965&longitude=-79.2969938&radius=1000000&maxResults=1700&country=ca&language=en-ca&showClosed=&hours24Text=Open%2024%20hr"
payload = {}
files = {}
headers = {
"authority": "www.mcdonalds.com",
"sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
"accept": "*/*",
"x-requested-with": "XMLHttpRequest",
"sec-ch-ua-mobile": "?0",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://www.mcdonalds.com/ca/en-ca/restaurant-locator.html",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"cookie": "bm_sz=C04645E7F7A956C5F9D9C5A20DEAEC97~YAAQ1Cv2SEtfMBN6AQAAItxfEwwTVV2V2Tr7UWpPt1Ps7gl84FzQlmbWIm4kBBh5dxlK3w8RenwiEiKtvERE6dLmrwPwJUuy+14gU/LeEZvP+uxzyBr04oQXdcSEQuiOgdkAGasqnBrTw1mp5E5iehnRpvHBDdSqh8wRSgJV0eG4f8YwSz66BfntCBALtQNCAFK2; _abck=F05779F2345218EA4989FF467D897C5A~0~YAAQ1Cv2SExfMBN6AQAAItxfEwaIwCrBeP25JBhBb7TX+HmnLQgrj1TkosrB+oHSv9ctrxRukqEDUaHPL1KkjpqjY1XY1yyulQ0ZRhsEfhY968YVsTOqfiosAu3kykd3pJG/bQ37XHwWs5qXpIdhMXRwJwXmkYtl3ETG8kXK2iZ22Q31COaSjNVACLaa7s9tCk9ItgLvUj5x9Nldjnd8AdXR0pXicrQY1IaruJyNqwMcJv42AUHW7iH4Ex9ZOSYsgEjLMNd44mS525X/gSNUTSOzoqoWsnH4MU59vfgLTwc2hVncAv67LBViTLxbWw4eVAvz7Z5phQfCmvoIy0PD8gy5iwPDMaD3GASrK9xScDPAPUI2wquxmSJ+f2cQaxZQKhvJCeH9cz14OZfx8ksA2ss53E0l0kDvgmnw~-1~-1~-1; ak_bmsc=BA4817D8DEE20E92C1E6251C54FC124348F62BD48F5F00005F91C9608B679D5F~plUkbYfsvYr5dCayJ9dMGEJ3QDgkmkv2mLpE7pCY9vW0xrdawvmyxfSnupw/4F7C48Akdn8PKsBniqz+7F+RZb8v4AkvH3c0RuvnynqJoni+kJcDYtPOxdMvdtGdTlZGIkSQNfpcxHNQDVlzojdSBX0vyBh/8seKQv10U67M7m787olYzg9jnsUwk3/VHBrnMDogiWJT8rNV7saSXunN0pAgucZWo/XhCpTJL+tI9urt0=; MCDCountry_code=US; bm_mi=BEE06312635FD442995BC0237BAFDA7C~f/RxgMW/JJSUc/wB9ZRg9fPD/76+wq/TaoWEZR1/ttrAiVTO256xhDTsVYc/kdHIjWkxvfO4XDcBjqe4hQ4qXt8Anpfi09vna/zcC7l6OVWpWeRSoZNztl7h5VF407L3XG+9CpzjSHNcaqAPRk5d0J5gLMtL/KmR8XBkAC0Syim7ST97nxNrPfLdlkSPMGm4Oy86xvY5PH5Nu47zS/gwhanBFg69tAdrQdaZewE2eGuzoJPsZit3UsihTzhXc4LY92hfSdh3/kZRId+NE8Jp0w==; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN4a9yst3R170rBCm1egzGvCBmB1jq9aCwQm5VgIJgloPOdpiIPfD3kDxFbKhqMuS5U=; JSESSIONID=64PZkBXhhpvNjM4NganzSZ0r1npIIaM7Fo84EsxN.eap7node7; _abck=F05779F2345218EA4989FF467D897C5A~-1~YAAQ1Cv2SExyMBN6AQAA5Et0EwZueCejZbKz1VDGCq2sB43Yx4dq0SiiGeUS6gVpXRIdw3rA3OdpNGHq7tVzQ+IvPpEKwLML9736x1qB5SQxV3jai89y2B2QF6K8nKtyrDAes0qbeTyIrHu0Rh1HLs7CjNxiLi0wswbCZfSsPI6fJZiEt+Itre3lfmua/HkhIRwpVTKqlVN5eQ8XIX+s1jJbINx/jUmMTW+jB5k4A5NARGChYH7rJQGYIT/oyZYpSbS3Yweqa4FRgGMW4gYZBN39+t2xSfewADLdpihfOnoZtakw9VhcvAKaf4mEzjB7WEfNJIZSjSE8DzvbJNIF41MGuAhhrnEBwBE8uVCZsA+2qjVPSADVp2Nn8JanJXCbucnLFOLsmPz3oVtGzentht1cHog4+eYOUlmw~0~-1~-1; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN5ZCTzA250oKEeVeXaa6j4gEGJ9RRtrTXQdYXzzSx6fM9aLwif+We2vtIc1yLQgTt4=",
"dnt": "1",
}
response = requests.request(
"GET", url, headers=headers, data=payload, files=files
)
stores = json.loads(response.text)
with open("McDonaldworkshop.csv", mode="w") as CSVFile:
writer = csv.writer(
CSVFile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
)
writer.writerow(
[
"address",
"postcode",
"telephone",
]
)
for store in stores["features"]:
address = store["properties"]["addressLine1"]
post_code = store["properties"]["postcode"]
telephone = store["properties"].get("telephone", "N/A")
writer.writerow([address,post_code, telephone])
Try:
import csv
import json
import requests
url = "https://www.mcdonalds.com/googleapps/GoogleRestaurantLocAction.do?method=searchLocation&latitude=43.6936965&longitude=-79.2969938&radius=1000000&maxResults=1700&country=ca&language=en-ca&showClosed=&hours24Text=Open%2024%20hr"
payload = {}
files = {}
headers = {
"authority": "www.mcdonalds.com",
"sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
"accept": "*/*",
"x-requested-with": "XMLHttpRequest",
"sec-ch-ua-mobile": "?0",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://www.mcdonalds.com/ca/en-ca/restaurant-locator.html",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"cookie": "bm_sz=C04645E7F7A956C5F9D9C5A20DEAEC97~YAAQ1Cv2SEtfMBN6AQAAItxfEwwTVV2V2Tr7UWpPt1Ps7gl84FzQlmbWIm4kBBh5dxlK3w8RenwiEiKtvERE6dLmrwPwJUuy+14gU/LeEZvP+uxzyBr04oQXdcSEQuiOgdkAGasqnBrTw1mp5E5iehnRpvHBDdSqh8wRSgJV0eG4f8YwSz66BfntCBALtQNCAFK2; _abck=F05779F2345218EA4989FF467D897C5A~0~YAAQ1Cv2SExfMBN6AQAAItxfEwaIwCrBeP25JBhBb7TX+HmnLQgrj1TkosrB+oHSv9ctrxRukqEDUaHPL1KkjpqjY1XY1yyulQ0ZRhsEfhY968YVsTOqfiosAu3kykd3pJG/bQ37XHwWs5qXpIdhMXRwJwXmkYtl3ETG8kXK2iZ22Q31COaSjNVACLaa7s9tCk9ItgLvUj5x9Nldjnd8AdXR0pXicrQY1IaruJyNqwMcJv42AUHW7iH4Ex9ZOSYsgEjLMNd44mS525X/gSNUTSOzoqoWsnH4MU59vfgLTwc2hVncAv67LBViTLxbWw4eVAvz7Z5phQfCmvoIy0PD8gy5iwPDMaD3GASrK9xScDPAPUI2wquxmSJ+f2cQaxZQKhvJCeH9cz14OZfx8ksA2ss53E0l0kDvgmnw~-1~-1~-1; ak_bmsc=BA4817D8DEE20E92C1E6251C54FC124348F62BD48F5F00005F91C9608B679D5F~plUkbYfsvYr5dCayJ9dMGEJ3QDgkmkv2mLpE7pCY9vW0xrdawvmyxfSnupw/4F7C48Akdn8PKsBniqz+7F+RZb8v4AkvH3c0RuvnynqJoni+kJcDYtPOxdMvdtGdTlZGIkSQNfpcxHNQDVlzojdSBX0vyBh/8seKQv10U67M7m787olYzg9jnsUwk3/VHBrnMDogiWJT8rNV7saSXunN0pAgucZWo/XhCpTJL+tI9urt0=; MCDCountry_code=US; bm_mi=BEE06312635FD442995BC0237BAFDA7C~f/RxgMW/JJSUc/wB9ZRg9fPD/76+wq/TaoWEZR1/ttrAiVTO256xhDTsVYc/kdHIjWkxvfO4XDcBjqe4hQ4qXt8Anpfi09vna/zcC7l6OVWpWeRSoZNztl7h5VF407L3XG+9CpzjSHNcaqAPRk5d0J5gLMtL/KmR8XBkAC0Syim7ST97nxNrPfLdlkSPMGm4Oy86xvY5PH5Nu47zS/gwhanBFg69tAdrQdaZewE2eGuzoJPsZit3UsihTzhXc4LY92hfSdh3/kZRId+NE8Jp0w==; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN4a9yst3R170rBCm1egzGvCBmB1jq9aCwQm5VgIJgloPOdpiIPfD3kDxFbKhqMuS5U=; JSESSIONID=64PZkBXhhpvNjM4NganzSZ0r1npIIaM7Fo84EsxN.eap7node7; _abck=F05779F2345218EA4989FF467D897C5A~-1~YAAQ1Cv2SExyMBN6AQAA5Et0EwZueCejZbKz1VDGCq2sB43Yx4dq0SiiGeUS6gVpXRIdw3rA3OdpNGHq7tVzQ+IvPpEKwLML9736x1qB5SQxV3jai89y2B2QF6K8nKtyrDAes0qbeTyIrHu0Rh1HLs7CjNxiLi0wswbCZfSsPI6fJZiEt+Itre3lfmua/HkhIRwpVTKqlVN5eQ8XIX+s1jJbINx/jUmMTW+jB5k4A5NARGChYH7rJQGYIT/oyZYpSbS3Yweqa4FRgGMW4gYZBN39+t2xSfewADLdpihfOnoZtakw9VhcvAKaf4mEzjB7WEfNJIZSjSE8DzvbJNIF41MGuAhhrnEBwBE8uVCZsA+2qjVPSADVp2Nn8JanJXCbucnLFOLsmPz3oVtGzentht1cHog4+eYOUlmw~0~-1~-1; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN5ZCTzA250oKEeVeXaa6j4gEGJ9RRtrTXQdYXzzSx6fM9aLwif+We2vtIc1yLQgTt4=",
"dnt": "1",
}
response = requests.request(
"GET", url, headers=headers, data=payload, files=files
)
stores = json.loads(response.text)
with open("data.csv", mode="w") as CSVFile:
writer = csv.writer(
CSVFile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
)
writer.writerow(["address", "postcode", "telephone", "has walmart"])
for store in stores["features"]:
has_walmart = "(Wal-Mart)" in store["properties"]["addressLine1"]
address = store["properties"]["addressLine1"].replace("(Wal-Mart)", "")
post_code = store["properties"]["postcode"]
telephone = store["properties"].get("telephone", "N/A")
writer.writerow(
[address, post_code, telephone, "X" if has_walmart else ""]
)
This will create a new column, has walmart, that contains "X" if the address contains (Wal-Mart). It also removes (Wal-Mart) from the address.
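Since the original question mentions re, the same split can also be done with a regular expression, which tolerates spacing and capitalisation differences; a small sketch on a hypothetical address string:
import re

raw = "100 Example Rd (Wal-Mart)"  # hypothetical addressLine1 value
has_walmart = re.search(r"\(wal-?mart\)", raw, flags=re.IGNORECASE) is not None
address = re.sub(r"\s*\(wal-?mart\)", "", raw, flags=re.IGNORECASE).strip()
print(address, has_walmart)  # 100 Example Rd True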
import urllib.request
url = ""
http_header = {
"User-Agent": "Mozilla/5.0(compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0) like Gecko",
"Accept": "text/html, application/xhtml+xml, */*",
"Accept-Language": "ko-KR",
"Content-type": "application/x-www-form-urlencoded",
"Host": ""
}
params = {
'id': 'asdgasd',
'call_flag': 'idPoolChk',
'reg_type': 'NY'
}
data = urllib.parse.urlencode(params).encode()
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
the_page = response.read()
print(the_page)
When I run this program, I get:
\xec\x95\x84\xec\x9d\xb4\xeb\x94\x94\xea\xb0\x80 \xec\xa4\x91\xeb\xb3\xb5\xeb\x90\xa9\xeb\x8b\x88\xeb\x8b\xa4
How can I transform this to Korean?
It's UTF-8.
print(the_page.decode('utf-8'))
assuming your console will handle those characters.
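As a quick check, the bytes shown in the question decode to a Korean message (roughly "the ID is duplicated"):
raw = b'\xec\x95\x84\xec\x9d\xb4\xeb\x94\x94\xea\xb0\x80 \xec\xa4\x91\xeb\xb3\xb5\xeb\x90\xa9\xeb\x8b\x88\xeb\x8b\xa4'
print(raw.decode('utf-8'))  # 아이디가 중복됩니다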