Regular expression operations to separate store locations within Walmart locations - Python

I am currently trying to implement a regular expression in my code for McDonald's locations across Canada. The goal is to add a column to my CSV that states whether the location is inside a Walmart. All the addresses in "addressLine1" that are in a Walmart have a tag that says (Walmart). I am hoping to separate it; if anyone can help with that, it would be great! If there is a way to do it in Excel, that would be just as good.
import csv
import json
import requests
import re
url = "https://www.mcdonalds.com/googleapps/GoogleRestaurantLocAction.do?method=searchLocation&latitude=43.6936965&longitude=-79.2969938&radius=1000000&maxResults=1700&country=ca&language=en-ca&showClosed=&hours24Text=Open%2024%20hr"
payload = {}
files = {}
headers = {
"authority": "www.mcdonalds.com",
"sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
"accept": "*/*",
"x-requested-with": "XMLHttpRequest",
"sec-ch-ua-mobile": "?0",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://www.mcdonalds.com/ca/en-ca/restaurant-locator.html",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"cookie": "bm_sz=C04645E7F7A956C5F9D9C5A20DEAEC97~YAAQ1Cv2SEtfMBN6AQAAItxfEwwTVV2V2Tr7UWpPt1Ps7gl84FzQlmbWIm4kBBh5dxlK3w8RenwiEiKtvERE6dLmrwPwJUuy+14gU/LeEZvP+uxzyBr04oQXdcSEQuiOgdkAGasqnBrTw1mp5E5iehnRpvHBDdSqh8wRSgJV0eG4f8YwSz66BfntCBALtQNCAFK2; _abck=F05779F2345218EA4989FF467D897C5A~0~YAAQ1Cv2SExfMBN6AQAAItxfEwaIwCrBeP25JBhBb7TX+HmnLQgrj1TkosrB+oHSv9ctrxRukqEDUaHPL1KkjpqjY1XY1yyulQ0ZRhsEfhY968YVsTOqfiosAu3kykd3pJG/bQ37XHwWs5qXpIdhMXRwJwXmkYtl3ETG8kXK2iZ22Q31COaSjNVACLaa7s9tCk9ItgLvUj5x9Nldjnd8AdXR0pXicrQY1IaruJyNqwMcJv42AUHW7iH4Ex9ZOSYsgEjLMNd44mS525X/gSNUTSOzoqoWsnH4MU59vfgLTwc2hVncAv67LBViTLxbWw4eVAvz7Z5phQfCmvoIy0PD8gy5iwPDMaD3GASrK9xScDPAPUI2wquxmSJ+f2cQaxZQKhvJCeH9cz14OZfx8ksA2ss53E0l0kDvgmnw~-1~-1~-1; ak_bmsc=BA4817D8DEE20E92C1E6251C54FC124348F62BD48F5F00005F91C9608B679D5F~plUkbYfsvYr5dCayJ9dMGEJ3QDgkmkv2mLpE7pCY9vW0xrdawvmyxfSnupw/4F7C48Akdn8PKsBniqz+7F+RZb8v4AkvH3c0RuvnynqJoni+kJcDYtPOxdMvdtGdTlZGIkSQNfpcxHNQDVlzojdSBX0vyBh/8seKQv10U67M7m787olYzg9jnsUwk3/VHBrnMDogiWJT8rNV7saSXunN0pAgucZWo/XhCpTJL+tI9urt0=; MCDCountry_code=US; bm_mi=BEE06312635FD442995BC0237BAFDA7C~f/RxgMW/JJSUc/wB9ZRg9fPD/76+wq/TaoWEZR1/ttrAiVTO256xhDTsVYc/kdHIjWkxvfO4XDcBjqe4hQ4qXt8Anpfi09vna/zcC7l6OVWpWeRSoZNztl7h5VF407L3XG+9CpzjSHNcaqAPRk5d0J5gLMtL/KmR8XBkAC0Syim7ST97nxNrPfLdlkSPMGm4Oy86xvY5PH5Nu47zS/gwhanBFg69tAdrQdaZewE2eGuzoJPsZit3UsihTzhXc4LY92hfSdh3/kZRId+NE8Jp0w==; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN4a9yst3R170rBCm1egzGvCBmB1jq9aCwQm5VgIJgloPOdpiIPfD3kDxFbKhqMuS5U=; JSESSIONID=64PZkBXhhpvNjM4NganzSZ0r1npIIaM7Fo84EsxN.eap7node7; _abck=F05779F2345218EA4989FF467D897C5A~-1~YAAQ1Cv2SExyMBN6AQAA5Et0EwZueCejZbKz1VDGCq2sB43Yx4dq0SiiGeUS6gVpXRIdw3rA3OdpNGHq7tVzQ+IvPpEKwLML9736x1qB5SQxV3jai89y2B2QF6K8nKtyrDAes0qbeTyIrHu0Rh1HLs7CjNxiLi0wswbCZfSsPI6fJZiEt+Itre3lfmua/HkhIRwpVTKqlVN5eQ8XIX+s1jJbINx/jUmMTW+jB5k4A5NARGChYH7rJQGYIT/oyZYpSbS3Yweqa4FRgGMW4gYZBN39+t2xSfewADLdpihfOnoZtakw9VhcvAKaf4mEzjB7WEfNJIZSjSE8DzvbJNIF41MGuAhhrnEBwBE8uVCZsA+2qjVPSADVp2Nn8JanJXCbucnLFOLsmPz3oVtGzentht1cHog4+eYOUlmw~0~-1~-1; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN5ZCTzA250oKEeVeXaa6j4gEGJ9RRtrTXQdYXzzSx6fM9aLwif+We2vtIc1yLQgTt4=",
"dnt": "1",
}
response = requests.request(
"GET", url, headers=headers, data=payload, files=files
)
stores = json.loads(response.text)
with open("McDonaldworkshop
\.csv", mode="w") as CSVFile:
writer = csv.writer(
CSVFile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
)
writer.writerow(
[
"address",
"postcode",
"telephone",
]
)
for store in stores["features"]:
address = store["properties"]["addressLine1"]
post_code = store["properties"]["postcode"]
telephone = store["properties"].get("telephone", "N/A")
writer.writerow([address,post_code, telephone])

Try:
import csv
import json
import requests
url = "https://www.mcdonalds.com/googleapps/GoogleRestaurantLocAction.do?method=searchLocation&latitude=43.6936965&longitude=-79.2969938&radius=1000000&maxResults=1700&country=ca&language=en-ca&showClosed=&hours24Text=Open%2024%20hr"
payload = {}
files = {}
headers = {
"authority": "www.mcdonalds.com",
"sec-ch-ua": '" Not;A Brand";v="99", "Google Chrome";v="91", "Chromium";v="91"',
"accept": "*/*",
"x-requested-with": "XMLHttpRequest",
"sec-ch-ua-mobile": "?0",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "cors",
"sec-fetch-dest": "empty",
"referer": "https://www.mcdonalds.com/ca/en-ca/restaurant-locator.html",
"accept-language": "en-GB,en-US;q=0.9,en;q=0.8",
"cookie": "bm_sz=C04645E7F7A956C5F9D9C5A20DEAEC97~YAAQ1Cv2SEtfMBN6AQAAItxfEwwTVV2V2Tr7UWpPt1Ps7gl84FzQlmbWIm4kBBh5dxlK3w8RenwiEiKtvERE6dLmrwPwJUuy+14gU/LeEZvP+uxzyBr04oQXdcSEQuiOgdkAGasqnBrTw1mp5E5iehnRpvHBDdSqh8wRSgJV0eG4f8YwSz66BfntCBALtQNCAFK2; _abck=F05779F2345218EA4989FF467D897C5A~0~YAAQ1Cv2SExfMBN6AQAAItxfEwaIwCrBeP25JBhBb7TX+HmnLQgrj1TkosrB+oHSv9ctrxRukqEDUaHPL1KkjpqjY1XY1yyulQ0ZRhsEfhY968YVsTOqfiosAu3kykd3pJG/bQ37XHwWs5qXpIdhMXRwJwXmkYtl3ETG8kXK2iZ22Q31COaSjNVACLaa7s9tCk9ItgLvUj5x9Nldjnd8AdXR0pXicrQY1IaruJyNqwMcJv42AUHW7iH4Ex9ZOSYsgEjLMNd44mS525X/gSNUTSOzoqoWsnH4MU59vfgLTwc2hVncAv67LBViTLxbWw4eVAvz7Z5phQfCmvoIy0PD8gy5iwPDMaD3GASrK9xScDPAPUI2wquxmSJ+f2cQaxZQKhvJCeH9cz14OZfx8ksA2ss53E0l0kDvgmnw~-1~-1~-1; ak_bmsc=BA4817D8DEE20E92C1E6251C54FC124348F62BD48F5F00005F91C9608B679D5F~plUkbYfsvYr5dCayJ9dMGEJ3QDgkmkv2mLpE7pCY9vW0xrdawvmyxfSnupw/4F7C48Akdn8PKsBniqz+7F+RZb8v4AkvH3c0RuvnynqJoni+kJcDYtPOxdMvdtGdTlZGIkSQNfpcxHNQDVlzojdSBX0vyBh/8seKQv10U67M7m787olYzg9jnsUwk3/VHBrnMDogiWJT8rNV7saSXunN0pAgucZWo/XhCpTJL+tI9urt0=; MCDCountry_code=US; bm_mi=BEE06312635FD442995BC0237BAFDA7C~f/RxgMW/JJSUc/wB9ZRg9fPD/76+wq/TaoWEZR1/ttrAiVTO256xhDTsVYc/kdHIjWkxvfO4XDcBjqe4hQ4qXt8Anpfi09vna/zcC7l6OVWpWeRSoZNztl7h5VF407L3XG+9CpzjSHNcaqAPRk5d0J5gLMtL/KmR8XBkAC0Syim7ST97nxNrPfLdlkSPMGm4Oy86xvY5PH5Nu47zS/gwhanBFg69tAdrQdaZewE2eGuzoJPsZit3UsihTzhXc4LY92hfSdh3/kZRId+NE8Jp0w==; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN4a9yst3R170rBCm1egzGvCBmB1jq9aCwQm5VgIJgloPOdpiIPfD3kDxFbKhqMuS5U=; JSESSIONID=64PZkBXhhpvNjM4NganzSZ0r1npIIaM7Fo84EsxN.eap7node7; _abck=F05779F2345218EA4989FF467D897C5A~-1~YAAQ1Cv2SExyMBN6AQAA5Et0EwZueCejZbKz1VDGCq2sB43Yx4dq0SiiGeUS6gVpXRIdw3rA3OdpNGHq7tVzQ+IvPpEKwLML9736x1qB5SQxV3jai89y2B2QF6K8nKtyrDAes0qbeTyIrHu0Rh1HLs7CjNxiLi0wswbCZfSsPI6fJZiEt+Itre3lfmua/HkhIRwpVTKqlVN5eQ8XIX+s1jJbINx/jUmMTW+jB5k4A5NARGChYH7rJQGYIT/oyZYpSbS3Yweqa4FRgGMW4gYZBN39+t2xSfewADLdpihfOnoZtakw9VhcvAKaf4mEzjB7WEfNJIZSjSE8DzvbJNIF41MGuAhhrnEBwBE8uVCZsA+2qjVPSADVp2Nn8JanJXCbucnLFOLsmPz3oVtGzentht1cHog4+eYOUlmw~0~-1~-1; bm_sv=7CACE3495320A7C0A6CF8F41DFE0EB36~F9KzvznVNk/fE4+ijLD5H/szY7O161rWlemmShElumIW7HN49Gq2d9Sd2tqBjCa9sJOX4zoehAkc8WvsID5Idon/hDlDeLJZuqnEmff4PN5ZCTzA250oKEeVeXaa6j4gEGJ9RRtrTXQdYXzzSx6fM9aLwif+We2vtIc1yLQgTt4=",
"dnt": "1",
}
response = requests.request(
"GET", url, headers=headers, data=payload, files=files
)
stores = json.loads(response.text)
with open("data.csv", mode="w") as CSVFile:
writer = csv.writer(
CSVFile, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL
)
writer.writerow(["address", "postcode", "telephone", "has walmart"])
for store in stores["features"]:
has_walmart = "(Wal-Mart)" in store["properties"]["addressLine1"]
address = store["properties"]["addressLine1"].replace("(Wal-Mart)", "")
post_code = store["properties"]["postcode"]
telephone = store["properties"].get("telephone", "N/A")
writer.writerow(
[address, post_code, telephone, "X" if has_walmart else ""]
)
This will create a new column, has walmart, which contains "X" if the address contains (Wal-Mart). It also replaces (Wal-Mart) with an empty string in address.
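If you want to match the tag with a regular expression instead - for example, to catch both "(walmart)" and "(Wal-Mart)" regardless of case - here is a small sketch of the same check with re:
import re
# case-insensitive pattern; assumes the tag appears as "(Walmart)", "(Wal-Mart)" or "(Wal Mart)"
WALMART_TAG = re.compile(r"\(wal[\s-]?mart\)", re.IGNORECASE)
def split_walmart(address):
    """Return the address without the tag plus a flag saying whether the tag was present."""
    has_walmart = bool(WALMART_TAG.search(address))
    return WALMART_TAG.sub("", address).strip(), has_walmart
print(split_walmart("123 Main St (Wal-Mart)"))  # ('123 Main St', True)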

Related

Parse JSON string into Python dictionary

I am trying to parse the JSON string from this page but couldn't convert it into a Python dict via json.loads. Here is my starter code:
import requests
import re
import json
import html
headers = {
"authority": "www.budgetpetproducts.com.au",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"accept-language": "en,ru;q=0.9",
"cache-control": "max-age=0",
# 'cookie': '_ALGOLIA=anonymous-6d2b7032-518b-4da0-b0b9-964e00920f3d; scarab.visitor=%2245DBAC5A87F9C262%22; _fbp=fb.2.1674782859328.577366252; _tt_enable_cookie=1; _ttp=jTkn1Rm860eDgPiXlG45pfHRDG7; _gid=GA1.3.415748101.1675529773; _ga=GA1.3.583864494.1674782855; _uetsid=d505d7e0a4ac11ed962a7de239a71ea1; _uetvid=c75d7e109de111eda4efb3023338d7ad; XSRF-TOKEN=eyJpdiI6IkJjNVpiMzNQRGFRYzJ6Z1ZUR2NEVFE9PSIsInZhbHVlIjoiSGNyN2J4dUtza1JadUhqUFAwWklWcFdIVmZLSC84OEp4TUtRYWdLQzBhUU1GK2psMzFHQW5SVTlZZm1Yd0xmaGt3QWFoRDVsNVYyRGdKYVRKbUZSak9UMzlCZEhXc0FubjdORERraE5nRHNsYWViVkxZZCt6d2VkbGNGNjhNWlciLCJtYWMiOiIyZjQxMzk5YjZjZWNmM2E1MmVjYmQxODAxNDY3ZWY1MTZiM2MyNzcwODBmY2ZlNWM5YTVmMDU4MWMwMDViZjQ5In0%3D; budget_pet_products_session=eyJpdiI6IkdnVHVrTjlkUGY1SGxMN0lZWTVsckE9PSIsInZhbHVlIjoibERhNnllekN5azRhMEptSU9QeGZ3VkVaYUpsUkxGbi9rR21yMXArWEoycWdlMWJSZFRnL3BtOUhBMXBXQ0syVEJPbUtaOVVpLzRkdFBwRDZsUU45V3lxd1JzK2lTU1RyLzkyM24xcTR2TUUvQVdrdEV0VHpoRDFIVFNBZHJTVTgiLCJtYWMiOiIwNDAzMTM3YmQ5YWE3ZWY3NmE3MjA3OGEyMTZmMWM5ODY3ZjlkOGZjNDEyYzkzNmM5MzhjY2ZkODcyNmU3N2NjIn0%3D; scarab.profile=%221677%7C1675608432%7C8787%7C1675608372%7C3624%7C1675146623%22; _ga_6YGE1ZKCTV=GS1.1.1675608368.2.1.1675608533.0.0.0',
"referer": "https://www.budgetpetproducts.com.au/dog/food?sort=best_match",
"sec-ch-ua": '"Not?A_Brand";v="8", "Chromium";v="108", "Yandex";v="23"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"Linux"',
"sec-fetch-dest": "document",
"sec-fetch-mode": "navigate",
"sec-fetch-site": "same-origin",
"sec-fetch-user": "?1",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 YaBrowser/23.1.1.1038 (beta) Yowser/2.5 Safari/537.36",
}
response = requests.get(
"https://www.budgetpetproducts.com.au/product/royal-canin-maxi-adult-dry-dog-food-4kg/1679",
headers=headers,
)
data = (
re.search(":data=(.*)", response.text)
.group(1)
.replace(':is-mobile="false"></product-page-component>', "")
.replace(""", '"')
.replace("'", '\\"')
)
clean_data = html.unescape(data)
json_blob = json.loads(clean_data)
print(json_blob)
The above code gives a JSONDecodeError:
File "/usr/lib/python3.9/json/decoder.py", line 340, in decode
raise JSONDecodeError("Extra data", s, end)
json.decoder.JSONDecodeError: Extra data: line 1 column 4 (char 3)
I checked whether the JSON is valid with an online JSON formatter, and it is: https://jsoneditoronline.org/#left=cloud.c7e2f35696094a07a49afba2d18c6ad4
As advised in the comment by Abolfazi Ghaemi, change the appropriate line to:
json_blob = json.loads(clean_data[1:-2])
in order to get this printed:
{'info': {'id': 1679, 'name': {'title': 'Royal Canin Maxi Adult Dry Dog Food 4kg', 'text': 'Royal Canin Maxi Adult Dry Dog Food 4kg', 'icon': None, 'slug': 'royal-canin-maxi-adult-dry-dog-food-4kg'}, 'category': {'html': ['Dog', ...
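Alternatively, instead of slicing a fixed number of characters off the string, json.JSONDecoder.raw_decode can parse the first complete JSON object and ignore whatever trails it - a sketch continuing from clean_data above, assuming the payload starts at the first brace:
import json
decoder = json.JSONDecoder()
start = clean_data.index("{")  # position of the first "{" - assumed to open the payload object
json_blob, end = decoder.raw_decode(clean_data, start)
print(json_blob["info"]["name"]["title"])  # Royal Canin Maxi Adult Dry Dog Food 4kg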

Receiving hCaptcha error from Discord while using 2captcha (working code pasted below)

from twocaptcha import TwoCaptcha
import json
import requests
import random
import sys
def rand(list):
    return random.randrange(0, len(list))
user_agents = [
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246',
'Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36',
'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1'
]
headers = {
'user-agent': user_agents[rand(user_agents)],
"accept": "*/*",
"authority": "discord.com",
"method": "POST",
"path": "/api/v9/auth/register",
"scheme": "https",
"origin": "discord.com",
"referer": "discord.com/register",
"x-debug-options": "bugReporterEnabled",
"accept-language": "en-US,en;q=0.9",
"connection": "keep-alive",
"content-Type": "application/json",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiRGlzY29yZCBDbGllbnQiLCJyZWxlYXNlX2NoYW5uZWwiOiJzdGFibGUiLCJjbGllbnRfdmVyc2lvbiI6IjEuMC45MDAzIiwib3NfdmVyc2lvbiI6IjEwLjAuMjIwMDAiLCJvc19hcmNoIjoieDY0Iiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiY2xpZW50X2J1aWxkX251bWJlciI6MTA0OTY3LCJjbGllbnRfZXZlbnRfc291cmNlIjpudWxsfQ==",
"sec-fetch-dest": "empty",
"sec-fetch-mode": "cors",
"sec-fetch-site": "same-origin"
}
solver = TwoCaptcha('redacted')
try:
    result = solver.hcaptcha(
        sitekey='4c672d35-0701-42b2-88c3-78380b0db560',
        url='https://discord.com/register',
    )
except Exception as e:
    sys.exit(e)
else:
    print(result['code'])
def getfingerprint():
    request_url = "https://discord.com/api/v9/experiments?with_guild_experiments=true"
    r = requests.get(request_url, headers={
"user-agent": user_agents[rand(user_agents)],
"x-context-properties": "eyJsb2NhdGlvbiI6Ii9jaGFubmVscy9AbWUifQ==",
"x-super-properties": "eyJvcyI6IldpbmRvd3MiLCJicm93c2VyIjoiQ2hyb21lIiwiZGV2aWNlIjoiIiwic3lzdGVtX2xvY2FsZSI6ImVuLVVTIiwiYnJvd3Nlcl91c2VyX2FnZW50IjoiTW96aWxsYS81LjAgKFdpbmRvd3MgTlQgMTAuMDsgV2luNjQ7IHg2NCkgQXBwbGVXZWJLaXQvNTM3LjM2IChLSFRNTCwgbGlrZSBHZWNrbykgQ2hyb21lLzEwNS4wLjAuMCBTYWZhcmkvNTM3LjM2IiwiYnJvd3Nlcl92ZXJzaW9uIjoiMTA1LjAuMC4wIiwib3NfdmVyc2lvbiI6IjEwIiwicmVmZXJyZXIiOiIiLCJyZWZlcnJpbmdfZG9tYWluIjoiIiwicmVmZXJyZXJfY3VycmVudCI6IiIsInJlZmVycmluZ19kb21haW5fY3VycmVudCI6IiIsInJlbGVhc2VfY2hhbm5lbCI6InN0YWJsZSIsImNsaWVudF9idWlsZF9udW1iZXIiOjE0NTQyOSwiY2xpZW50X2V2ZW50X3NvdXJjZSI6bnVsbH0="
})
    if r.status_code == 200:
        fingerprint = json.loads(r.text)["fingerprint"]
        print(fingerprint)
    else:
        return r.text
payload = {
'captcha_key': result['code'],
'consent': True,
'date_of_birth': '2001-11-19',
'email': 'enter email',
'fingerprint': getfingerprint(),
'gift_code_sku_id': None,
'invite': None,
'password': 'enter password',
'username': 'enter username'
}
def register():
    data = payload
    res = requests.post('https://discord.com/api/v9/auth/register', headers=headers, json=data)
    print('Token: ' + res.text)
if __name__ == '__main__':
    register()
The code now works and generates accounts.
You just have to enter your own email, username, and password.
You can tell it works because it outputs a token.
If you get an error mentioning an invalid hCaptcha response, change the email, use a VPN, and try again.
If you decide to loop this, have it loop slowly or set up proxies.

Python: Replace Python text - CURL Python

I need the variable defined as NUEMRODEDNI to be substituted into the request data; how is that done?
Excuse me, I'm a newbie.
I don't know how to use a variable defined in Python; I need it to be replaced as shown in the image.
import string
import requests
from requests.structures import CaseInsensitiveDict
url = "www.url.com:8022/SistemaIdentificacion/servlet/com.personas.consultapersona?030bf8cfcd4bfccbd543df61b1b43f67,gx-no-cache=1648440596691"
NUEMRODEDNI = "41087712"
headers = CaseInsensitiveDict()
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0"
headers["Accept"] = "*/*"
headers["Accept-Language"] = "es-AR,es;q=0.8,en-US;q=0.5,en;q=0.3"
headers["Accept-Encoding"] = "gzip, deflate"
headers["GxAjaxRequest"] = "1"
headers["Content-Type"] = "application/json"
headers["AJAX_SECURITY_TOKEN"] = "a6da9873adb..."
headers["X-GXAUTH-TOKEN"] = "eyJ0eXAiOiJ..."
headers["Origin"] = "http://www.url.com"
headers["Connection"] = "keep-alive"
headers["Referer"] = "www.url.com/SistemaIdentificacion/servlet/com.personas.consultapersona"
headers["Cookie"] = "GX_CLIENT_ID=0496f100-9e4e-4e36-a68d-ba3770ee2bff; GX_SESSION_ID=KUqyHU%2FZbpu96sYlj7Gry8bCYpV6CaSgVk0BLxVCpAU%3D; JSESSIONID=1812E6AC00940BDB325EF9592CB93FF8; GxTZOffset=America/Argentina/Buenos_Aires"
data = '{"MPage":false,"cmpCtx":"","parms":[EDIT HERE,{"s":"M","v":[["","(None)"],["F","Femenino"],["M","Masculino"]]},"M",{"User":"","CompanyCode":0,"Profile":"","UsrOneLoginID":"6647","Depid":1,"UsrLP":6488,"unidad":"","unidadid":"68","IMFParteCuerpo":"","denunciasid":0,"destino":"68","TipoPersona":"","NombreArchivo":"","denorigen":"","macdestinoscodorganigrama":""}],"hsh":[],"objClass":"consultapersona","pkgName":"com.personas","events":["ENTER"],"grids":{}}'
resp = requests.post(url, headers=headers, data=data)
print(resp.content)
Please provide your code typed out rather than a screenshot of it, so that we can simply copy & run it on our end.
Nonetheless, you should try:
NUMERODNI = "41087712"
data = '{"MPage":false,"cmpCtx":"","parms":[EDIT HERE,{"s":"M","v":[["","(None)"],["F","Femenino"],["M","Masculino"]]},"M",{"User":"","CompanyCode":0,"Profile":"","UsrOneLoginID":"6647","Depid":1,"UsrLP":6488,"unidad":"","unidadid":"68","IMFParteCuerpo":"","denunciasid":0,"destino":"68","TipoPersona":"","NombreArchivo":"","denorigen":"","macdestinoscodorganigrama":""}],"hsh":[],"objClass":"consultapersona","pkgName":"com.personas","events":["ENTER"],"grids":{}}'
data = data.replace("EDIT HERE", NUMERODNI)
print(data) # '{... "parms":[41087712, ...}'
This solution definitely delivers the desired string as a result.
If your code still does not do what you would like it to, then the actual issue has to be somewhere else.
Since you're POSTing JSON data, it's easier to just keep your data as a Python dict and tell Requests to JSON-ify it (requests.post(json=...)).
That way you don't need string substitution, just use the variable.
I also took the opportunity to make your headers construction shorter – it can just be a dict.
import requests
url = "www.url.com:8022/SistemaIdentificacion/servlet/com.personas.consultapersona?030bf8cfcd4bfccbd543df61b1b43f67,gx-no-cache=1648440596691"
NUEMRODEDNI = "41087712"
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:98.0) Gecko/20100101 Firefox/98.0",
"Accept": "*/*",
"Accept-Language": "es-AR,es;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding": "gzip, deflate",
"GxAjaxRequest": "1",
"Content-Type": "application/json",
"AJAX_SECURITY_TOKEN": "a6da9873adb...",
"X-GXAUTH-TOKEN": "eyJ0eXAiOiJ...",
"Origin": "http://www.url.com",
"Connection": "keep-alive",
"Referer": "www.url.com/SistemaIdentificacion/servlet/com.personas.consultapersona",
"Cookie": "GX_CLIENT_ID=0496f100-9e4e-4e36-a68d-ba3770ee2bff; GX_SESSION_ID=KUqyHU%2FZbpu96sYlj7Gry8bCYpV6CaSgVk0BLxVCpAU%3D; JSESSIONID=1812E6AC00940BDB325EF9592CB93FF8; GxTZOffset=America/Argentina/Buenos_Aires",
}
data = {
"MPage": False,
"cmpCtx": "",
"parms": [
NUEMRODEDNI,
{"s": "M", "v": [["", "(None)"], ["F", "Femenino"], ["M", "Masculino"]]},
"M",
{
"User": "",
"CompanyCode": 0,
"Profile": "",
"UsrOneLoginID": "6647",
"Depid": 1,
"UsrLP": 6488,
"unidad": "",
"unidadid": "68",
"IMFParteCuerpo": "",
"denunciasid": 0,
"destino": "68",
"TipoPersona": "",
"NombreArchivo": "",
"denorigen": "",
"macdestinoscodorganigrama": "",
},
],
"hsh": [],
"objClass": "consultapersona",
"pkgName": "com.personas",
"events": ["ENTER"],
"grids": {},
}
resp = requests.post(url, headers=headers, json=data)
resp.raise_for_status()
print(resp.content)
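As a side note, requests.post(json=data) serializes the dict and sets the Content-Type: application/json header on its own, so the Content-Type entry kept in headers above is redundant (though harmless).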

Web scraping - "This application is out of date, please click the refresh button on your browser" return message

I would like to scrape some data from the following web site: https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract
Steps I would like to automate are:
Choose "Opcinski sud/ZK odjel". For example choose "Zemljišnoknjižni odjel Benkovac".
Choose "Glavna knjiga". For example choose "BENKOVAC"
Enter "Broj kat. čestice:". For example, enter 576/2.
Select "Da" in "Povijesni pregled" (the last row, leave "Broj ZK uloska empty").
Click "Pregledaj" and solve the captcha.
Scrape the HTML that appears.
I have tried to follow the above steps using plain requests in Python by following the Network tab after opening the inspector in the web browser.
There are lots of requests on the page. I will divide my code into several steps:
Start a session and make the requests that happen at the start of the page
import requests
import re
import shutil
from twocaptcha import TwoCaptcha
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json
import uuid
# start session
url = 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract'
session = requests.Session()
session.get(url)
jid = session.cookies.get_dict()['JSESSIONID']
# some requests at the start of the page (probably redundant)
headers = {
'Referer': 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}
session.get("https://oss.uredjenazemlja.hr/public/js/libs/modernizr-2.5.3.min.js", headers = headers) #
session.get("https://oss.uredjenazemlja.hr/public/js/libs/jquery-1.7.1.min.js", headers = headers) #
session.get("https://oss.uredjenazemlja.hr/public/js/script.js", headers = headers) # script.json
# no cache json
headers = {
'Cookie': 'ossprivatelang=hr_HR; gxtTheme=m%3Aid%7Cs%3Agray%2Cfile%7Cs%3Axtheme-gray.css; JSESSIONID=' + jid,
"Connection": "keep-alive",
'Host': 'oss.uredjenazemlja.hr',
'Referer': 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
"sec-ch-ua": '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"Sec-Fetch-Dest": "script",
"Sec-Fetch-Mode": "no-cors",
"Sec-Fetch-Site": "same-origin",
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
res = session.get('https://oss.uredjenazemlja.hr/public/gwt/hr.ericsson.oss.ui.pia.OssPiaModule.nocache.js', headers = headers)
cache_html = re.findall(r'bc=\'(.*\.cache.html)\',C', res.text)[0]
# cache_html = "1F6C776DEF6D55F56C900B938F84D726.cache.html"
# some more requests at the start of the page (probably redundant)
headers = {
'Referer': 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36',
}
session.get("https://oss.uredjenazemlja.hr/public/gwt/tiny_mce_editor/tiny_mce_src.js", headers = headers) # tiny_mce_src.js
session.get("https://oss.uredjenazemlja.hr/public/gwt/js/common.js", headers = headers)
session.get("https://oss.uredjenazemlja.hr/public/gwt/js/blueimp_tmpl.js", headers = headers) # blueimp_tmpl.js
# cache json
headers = {
"DNT": "1",
'Referer': 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
"sec-ch-ua": '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'Sec-Fetch-Dest': 'iframe',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
session.get('https://oss.uredjenazemlja.hr/public/gwt/' + cache_html, headers = headers)
Then, I made requests for steps 1 and 2 above:
# commonRPCService opcinski sud 1
headers = {
'Accept': '*/*',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'hr-HR,hr;q=0.9,en-US;q=0.8,en;q=0.7',
'Connection': 'keep-alive',
# 'Content-Length': '166',
'Content-Type': 'text/x-gwt-rpc; charset=UTF-8',
'Cookie': 'gxtTheme=m%3Aid%7Cs%3Agray%2Cfile%7Cs%3Axtheme-gray.css; ossprivatelang=hr_HR; __utma=79801043.802441445.1616788486.1616788486.1616788486.1; __utmz=79801043.1616788486.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided); x-auto-31=m%3Acollapsed%7Cb%3Atrue; JSESSIONID=' + jid,
"DNT": "1",
'Host': 'oss.uredjenazemlja.hr',
'Origin': 'https://oss.uredjenazemlja.hr',
'Referer': 'https://oss.uredjenazemlja.hr/public/gwt/' + cache_html,
'Sec-Fetch-Dest': 'empty',
'Sec-Fetch-Mode': 'cors',
'Sec-Fetch-Site': 'same-origin',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
payload = '5|0|4|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|getMainBook|1|2|3|4|0|'
res = session.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
headers = headers,
data=payload
)
print(res.text)
# commonRPCService opcinski sud 2
payload = '5|0|18|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|getLrInstitutions|com.extjs.gxt.ui.client.data.BaseModel|hr.ericsson.oss.ui.common.client.core.data.RpcModel/2891266824|dirty|java.lang.Boolean/476441737|new|deleted|resourceCode|java.lang.Integer/3438268394|elementSelected|class|java.lang.String/2004016611|hr.ericsson.jis.domain.admin.Institution|name||1|2|3|4|1|5|6|7|7|8|0|9|-2|10|-2|11|12|0|13|-2|14|15|16|17|15|18|'
res = session.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
data=payload,
headers=headers
)
# print(res.text)
# commonRPCService glavna knjiga 1
payload = '5|0|4|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|getMainBook|1|2|3|4|0|'
res = session.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
data=payload,
headers=headers
)
print(res.text)
# commonRPCService glavna knjiga 2
payload = ('5|0|34|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|getMainBooks|com.extjs.gxt.ui.client.data.BaseModel|hr.ericsson.oss.ui.common.client.core.data.RpcModel/2891266824|dirty|java.lang.Boolean/476441737|new|deleted|resourceCode|java.lang.Integer/3438268394|elementSelected|cadastralMunicipality|class|java.lang.String/2004016611|hr.ericsson.jis.domain.admin.CadastralMunicipality|hr.ericsson.jis.domain.admin.MainBook|institution|institutionId|parentInstitution|name|Općinski sud u Zadru|hr.ericsson.jis.domain.admin.Institution|institutionType|institutionTypeId|hr.ericsson.jis.domain.admin.InstitutionType|source|superviseInstitutionId|Zemljišnoknjižni odjel Benkovac|place|BENKOVAC|preconditionsRequired||1|2|3|4|1|5|6|10|7|8|0|9|-2|10|-2|11|12|0|13|-2|14|6|1|15|16|17|15|16|18|19|6|13|7|-2|9|-2|20|12|500|21|6|8|7|-2|9|-2|10|-2|20|12|605|11|12|0|13|-2|22|16|23|15|16|24|25|6|7|7|-2|9|-2|10|-2|26|12|14|11|-11|13|-2|15|16|27|28|12|1|10|-2|29|-10|11|-11|13|-2|22|16|30|31|16|32|15|-13|33|-2|22|16|34|').encode("utf-8")
res = session.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
data=payload,
headers=headers
)
Then I solve the captcha:
# some captcha post
payload = ('5|0|4|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|isCaptchaDisabled|1|2|3|4|0|').encode('utf-8')
res = session.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
data=payload,
headers=headers
)
print(res.text)
# get and save captcha
TWO_CAPTCHA_APY_KEY = "myapikey"
solver = TwoCaptcha(TWO_CAPTCHA_APY_KEY)
save_path = 'D:/zkrh/captchas'
p = session.get('https://oss.uredjenazemlja.hr/servlets/kaptcha.jpg?1617088523212',
headers=headers,
stream=True)
captcha_path = os.path.join(Path(save_path), 'captcha' + ".jpg")
with open(captcha_path, 'wb') as out_file:
    shutil.copyfileobj(p.raw, out_file)
# solve captcha
result = solver.normal(captcha_path, minLength=5, maxLength=5)
payload = ('5|0|6|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|validateCaptcha|java.lang.String|' +
result['code'] + '|1|2|3|4|1|5|6|').encode('utf-8')
res = requests.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
data=payload,
headers=headers
)
if res.text.startswith("//OK"):
os.rename(captcha_path, os.path.join(Path(save_path), result['code'] + ".jpg"))
else:
print("Kriva captcha. Rijesi!")
Now, here is the most important request, and I can't get the right output from it. It should return lots of numbers, where the most important ones are the 7-digit numbers (\d{7}; there should be one or more of them). I can use that number in the last step to get the HTML. Here is my try:
payload = ('5|0|40|https://oss.uredjenazemlja.hr/public/gwt/|0EAC9F40996251FDB21FF254E1600E83|hr.ericsson.oss.ui.pia.client.rpc.IOssPublicRPCService|getLrUnitsByMainBookAndParcel|com.extjs.gxt.ui.client.data.BaseModel|java.lang.String|hr.ericsson.oss.ui.common.client.core.data.RpcModel/2891266824|date|java.sql.Date/3996530531|dirty|java.lang.Boolean/476441737|new|cadastralMunicipality|id|java.lang.Integer/3438268394|class|java.lang.String/2004016611|hr.ericsson.jis.domain.admin.CadastralMunicipality|cadastralMunicipalityId|source|creationDate|formatedName|BENKOVAC|userId|cadInstitution|deleted|institutionId|resourceCode|elementSelected|name|Odjel za katastar nekretnina Benkovac|hr.ericsson.jis.domain.admin.Institution|institution|Zemljišnoknjižni odjel Benkovac|place|sidMainBook|java.lang.Long/4227064769|hr.ericsson.jis.domain.admin.MainBook|status|576/2|1|2|3|4|2|5|6|7|18|8|9|115|10|21|10|11|0|12|-3|13|7|3|14|15|98|16|17|18|19|-5|20|15|1|21|9|116|0|1|22|17|23|24|15|-9999|25|7|8|10|-3|12|-3|26|-3|27|15|117|28|15|0|29|-3|30|17|31|16|17|32|33|7|9|10|-3|12|-3|26|-3|27|15|500|28|-13|29|-3|30|17|34|35|-9|16|-15|26|-3|28|15|0|29|-3|30|-9|36|37|4730091|0|14|15|30857|16|17|38|39|-19|40|').encode('utf-8')
res = session.post(
'https://oss.uredjenazemlja.hr/rpc/commonRPCService',
data=payload,
headers=headers
)
print(res.text)
It returns:
"//EX[2,1,["com.google.gwt.user.client.rpc.IncompatibleRemoteServiceException/3936916533","This application is out of date, please click the refresh button on your browser. ( Blocked attempt to access interface 'hr.ericsson.oss.ui.pia.client.rpc.IOssPublicRPCService', which is not implemented by 'hr.ericsson.oss.ui.common.server.core.rpc.CommonRPCService'; this is either misconfiguration or a hack attempt )"],0,5]"
instead of numbers as I explained before.
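Once that call returns a //OK payload instead of the exception, my plan is to pull the 7-digit candidates out of the response text with something like:
import re
# collect every standalone 7-digit number from the RPC response text
lr_unit_candidates = re.findall(r"\b\d{7}\b", res.text)
print(lr_unit_candidates)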
Then, in the last step, I should use the 7-digit number as the lrUnitNumber parameter:
# Publicreportservlet
headers = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Accept-Language': 'hr-HR,hr;q=0.9,en-US;q=0.8,en;q=0.7',
'Cache-Control': 'max-age=0',
'Connection': 'keep-alive',
'Content-Length': '169',
'Content-Type': 'application/x-www-form-urlencoded',
'Cookie': 'ossprivatelang=hr_HR; gxtTheme=m%3Aid%7Cs%3Agray%2Cfile%7Cs%3Axtheme-gray.css; JSESSIONID=' + jid,
'Host': 'oss.uredjenazemlja.hr',
'Origin': 'https://oss.uredjenazemlja.hr',
'Referer': 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
'Sec-Fetch-Dest': 'iframe',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'same-origin',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
}
dataFrom = {
'pia': 1,
'report_type_id': 4,
'report_type_name': 'bzp_izvadak_oss',
'source': 1,
'institutionID': 500,
'mainBookId': 30857,
'lrUnitNumber': 5509665,
'lrunitID': 5799992,
'status': '0,1',
'footer': '',
'export_type': 'html'
}
res = session.post(
'https://oss.uredjenazemlja.hr/servlets/PublicReportServlet',
data=dataFrom,
headers=headers
)
res
I am providing the R code too. Maybe someone with R and web scraping knowledge can help:
library(httr)
library(rvest)
library(stringr)
library(reticulate)
twocaptcha <- reticulate::import("twocaptcha")
# captcha python library
TWO_CAPTCHA_APY_KEY = ".."
solver = twocaptcha$TwoCaptcha(TWO_CAPTCHA_APY_KEY)
#
url = 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract'
session = GET(url)
jid <- cookies(session)$value
headers_cache = c(
'Referer'= 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
'User-Agent'= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
)
session <- rvest:::request_GET(content(session), 'https://oss.uredjenazemlja.hr/public/gwt/hr.ericsson.oss.ui.pia.OssPiaModule.nocache.js',
add_headers(headers_cache))
cache_html <- str_extract(session$response, "bc=\\'(.*\\.cache.html)\\',C")
cache_html <- gsub(".*=\\'|\\'.C", "", cache_html)
headers_cache = c(
'Referer'= 'https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract',
'User-Agent'= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
)
session <- rvest:::request_GET(session, paste0('https://oss.uredjenazemlja.hr/public/gwt/', cache_html), add_headers(headers_cache))
# meta
commonRPCServiceUrl <- "https://oss.uredjenazemlja.hr/rpc/commonRPCService"
headers = c(
'Accept'= '*/*',
'Accept-Encoding'= 'gzip, deflate, br',
'Accept-Language'= 'hr-HR,hr;q=0.9,en-US;q=0.8,en;q=0.7',
'Connection'= 'keep-alive',
# 'Content-Length'= '166',
'Content-Type'= 'text/x-gwt-rpc; charset=UTF-8',
'Cookie'= paste0('gxtTheme=m%3Aid%7Cs%3Agray%2Cfile%7Cs%3Axtheme-gray.css; ossprivatelang=hr_HR; x-auto-31=m%3Acollapsed%7Cb%3Atrue; JSESSIONID=', jid),
'Host'= 'oss.uredjenazemlja.hr',
'Origin'= 'https://oss.uredjenazemlja.hr',
'Referer'= paste0('https://oss.uredjenazemlja.hr/public/gwt/', cache_html),
'Sec-Fetch-Dest'= 'empty',
'Sec-Fetch-Mode'= 'cors',
'Sec-Fetch-Site'= 'same-origin',
'User-Agent'= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
)
payload <- "5|0|4|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|getMainBook|1|2|3|4|0|"
session <- rvest:::request_POST(session, commonRPCServiceUrl, body = payload, add_headers(headers))
session$response$content
readBin(session$response$content, character(), endian = "little")
payload <- "5|0|22|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|getMainBooks|com.extjs.gxt.ui.client.data.BaseModel|hr.ericsson.oss.ui.common.client.core.data.RpcModel/2891266824|dirty|java.lang.Boolean/476441737|new|deleted|resourceCode|java.lang.Integer/3438268394|elementSelected|cadastralMunicipality|class|java.lang.String/2004016611|hr.ericsson.jis.domain.admin.CadastralMunicipality|hr.ericsson.jis.domain.admin.MainBook|institution|preconditionsRequired|name|VELIKA GORICA|1|2|3|4|1|5|6|10|7|8|0|9|-2|10|-2|11|12|0|13|-2|14|6|1|15|16|17|15|16|18|19|0|20|-2|21|16|22|"
session <- rvest:::request_POST(session, commonRPCServiceUrl, body = payload, add_headers(headers))
session$response$content
readBin(session$response$content, character(), endian = "little")
# captcha
payload <- "5|0|4|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|isCaptchaDisabled|1|2|3|4|0|"
session <- rvest:::request_POST(session, commonRPCServiceUrl, body = payload, add_headers(headers))
session$response$content
readBin(session$response$content, character(), endian = "little")
headers_captcha <- c(
"Accept"= "image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8",
"Accept-Encoding"= "gzip, deflate, br",
"Accept-Language"=" hr-HR,hr;q=0.9,en-US;q=0.8,en;q=0.7",
"Connection"= "keep-alive",
"Cookie"= paste0("gxtTheme=m%3Aid%7Cs%3Agray%2Cfile%7Cs%3Axtheme-gray.css; ossprivatelang=hr_HR; x-auto-31=m%3Acollapsed%7Cb%3Atrue; JSESSIONID=", jid),
"DNT"= "1",
"Host"= "oss.uredjenazemlja.hr",
"Referer"= "https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract",
"sec-ch-ua"= '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
"sec-ch-ua-mobile"= "?0",
"Sec-Fetch-Dest"= "image",
"Sec-Fetch-Mode"= "no-cors",
"Sec-Fetch-Site"= "same-origin",
"User-Agent"= "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
)
captcha <- GET("https://oss.uredjenazemlja.hr/servlets/kaptcha.jpg?1617286122160", add_headers(headers_captcha))
# session <- rvest:::request_GET(session, "https://oss.uredjenazemlja.hr/servlets/kaptcha.jpg?1617286122160", add_headers(headers_captcha))
captcha$content
captcha$response$content
writeBin(captcha$content, "D:/zkrh/captchas/test.jpg")
result = solver$normal("D:/zkrh/captchas/test.jpg", minLength=5, maxLength=5)
payload <- paste0("5|0|6|https://oss.uredjenazemlja.hr/public/gwt/|957F3F03E95E97ABBDE314DFFCCEF4BC|hr.ericsson.oss.ui.common.client.core.rpc.ICommonRPCService|validateCaptcha|java.lang.String|",
result$code, "|1|2|3|4|1|5|6|")
session <- rvest:::request_POST(session, commonRPCServiceUrl, body = payload, add_headers(headers))
session$response$content
readBin(session$response$content, character(), endian = "little")
# ID!!!!!!
headers = c(
'Accept'= '*/*',
'Accept-Encoding'= 'gzip, deflate, br',
'Accept-Language'= 'hr-HR,hr;q=0.9,en-US;q=0.8,en;q=0.7',
'Connection'= 'keep-alive',
# 'Content-Length'= '166',
'Content-Type'= 'text/x-gwt-rpc; charset=UTF-8',
'Cookie'= paste0('gxtTheme=m%3Aid%7Cs%3Agray%2Cfile%7Cs%3Axtheme-gray.css; ossprivatelang=hr_HR; x-auto-31=m%3Acollapsed%7Cb%3Atrue; JSESSIONID=', jid),
'DNT' = '1',
'Host'= 'oss.uredjenazemlja.hr',
'Origin'= 'https://oss.uredjenazemlja.hr',
'Referer'= paste0('https://oss.uredjenazemlja.hr/public/gwt/', cache_html),
'sec-ch-ua' = '"Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"',
'sec-ch-ua-mobile' = "?0",
'Sec-Fetch-Dest'= 'empty',
'Sec-Fetch-Mode'= 'cors',
'Sec-Fetch-Site'= 'same-origin',
'User-Agent'= 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36'
)
payload <- paste0("5|0|40|https://oss.uredjenazemlja.hr/public/gwt/|0EAC9F40996251FDB21FF254E1600E83|hr.ericsson.oss.ui.pia.client.rpc.IOssPublicRPCService|getLrUnitByMainBook|com.extjs.gxt.ui.client.data.BaseModel|java.lang.String|hr.ericsson.oss.ui.common.client.core.data.RpcModel/2891266824|date|java.sql.Date/3996530531|dirty|java.lang.Boolean/476441737|new|cadastralMunicipality|id|java.lang.Integer/3438268394|class|java.lang.String/2004016611|hr.ericsson.jis.domain.admin.CadastralMunicipality|cadastralMunicipalityId|source|creationDate|formatedName|VELIKA GORICA|userId|cadInstitution|deleted|institutionId|resourceCode|elementSelected|name|Odjel za katastar nekretnina Velika Gorica|hr.ericsson.jis.domain.admin.Institution|institution|Zemljišnoknjižni odjel Velika Gorica|place|sidMainBook|java.lang.Long/4227064769|hr.ericsson.jis.domain.admin.MainBook|status|1|1|2|3|4|2|5|6|7|18|8|9|114|1|21|10|11|0|12|-3|13|7|3|14|15|102844|16|17|18|19|-5|20|15|1|21|9|116|0|1|22|17|23|24|15|-20|25|7|8|10|-3|12|-3|26|-3|27|15|32|28|15|0|29|-3|30|17|31|16|17|32|33|7|9|10|-3|12|-3|26|-3|27|15|277|28|-13|29|-3|30|17|34|35|-9|16|-15|26|-3|28|-7|29|-3|30|-9|36|37|286610893|17179869184|14|15|21921|16|17|38|39|15|0|40|")
# Encoding(payload) <- "UTF-8"
# payload <- RCurl::curlEscape(payload)
session <- rvest:::request_POST(session, commonRPCServiceUrl, body = payload, add_headers(headers))
session$response$content
readBin(session$response$content, character())
I have found the error. The problem was a wrong url argument in one request.
You should really look into Selenium and BeautifulSoup4 to automate this - it's like requests on steroids.
See an example on my GitHub: https://github.com/stevenhurwitt/reliant-scrape/blob/master/reliant_scrape.py
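A minimal sketch of that approach (assuming a local Chrome/chromedriver setup; the form fields and the captcha still have to be handled, and the element locators have to be read out of the page's HTML):
from bs4 import BeautifulSoup
from selenium import webdriver
driver = webdriver.Chrome()  # assumes Selenium can find a matching chromedriver
driver.get("https://oss.uredjenazemlja.hr/public/lrServices.jsp?action=publicLdbExtract")
# ... fill in the form fields and solve the captcha here ...
soup = BeautifulSoup(driver.page_source, "html.parser")
driver.quit()
print(soup.title)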

Wizzair scraping

I'm trying to scrape WizzAir for personal use. I can't understand what's wrong with my code. Could it be an incorrect payload object or the cookies?
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
"Accept": "application/json, text/plain, */*",
"Accept-Encoding": "gzip, deflate, sdch, br",
"Accept-Language": "en-US,en;q=0.8,lt;q=0.6,ru;q=0.4",
"Origin": "https://wizzair.com",
"Referer": "https://wizzair.com/"
}
search_url = "https://wizzair.com/lt-LT/FlightSearch"
session = requests.Session()
r = session.get("https://be.wizzair.com/3.8.2/Api/asset/yellowRibbon", headers=headers, allow_redirects=False)
session_id = r.cookies["ASP.NET_SessionId"]
cookies = {
"ASP.NET_SessionId": session_id,
"HomePageSelector": "FlightSearch",
}
# wizz_url = "https://be.wizzair.com/3.8.2/Api/search/search"
wizz_url = "https://be.wizzair.com/3.8.2/Api/asset/farechart"
payload = {"flightList":[{"departureStation":"VNO","arrivalStation":"FCO","departureDate":"2017-02-20"}],"adultCount":1,"childCount":0,"infantCount":0,"wdc":True, "dayInterval":3}
r = session.post(url=wizz_url,data=payload,headers=headers, cookies=cookies)
print(r.content)
>>> {"validationCodes":["FlightCount_MustBe_OneOrTwo"]}
I ran this - even without a session and cookies - and got some data.
You have to send it as JSON, using json=payload:
import requests
payload = {
"flightList":[
{
"departureStation": "VNO",
"arrivalStation": "FCO",
"departureDate": "2017-02-20"
}
],
"adultCount": 1,
"childCount": 0,
"infantCount": 0,
"wdc": True,
"dayInterval": 3
}
url = 'https://be.wizzair.com/3.8.2/Api/search/search'
r = requests.post(url, json=payload)
print(r.text)
data = r.json()
print(data['outboundFlights'][0]['flightNumber'])
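For what it's worth, the original request most likely failed with FlightCount_MustBe_OneOrTwo because data=payload form-encodes the dictionary, so the nested flightList never reaches the API as structured JSON; json=payload serializes the whole payload and sets the Content-Type: application/json header.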
If you have to use cookies and headers, then use a Session, so you don't have to copy cookies and headers from one request to another.
import requests
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
#"Accept": "application/json, text/plain, */*",
#"Accept-Encoding": "gzip, deflate, sdch, br",
#"Accept-Language": "en-US,en;q=0.8,lt;q=0.6,ru;q=0.4",
}
s = requests.Session()
s.headers.update(headers)
# to get cookies
r = s.get("https://www.wizzair.com/")
payload = {
"flightList":[
{
"departureStation": "VNO",
"arrivalStation": "FCO",
"departureDate": "2017-02-20"
}
],
"adultCount": 1,
"childCount": 0,
"infantCount": 0,
"wdc": True,
"dayInterval": 3
}
url = 'https://be.wizzair.com/3.8.2/Api/search/search'
r = s.post(url, json=payload)
print(r.text)
data = r.json()
print(data['outboundFlights'][0]['flightNumber'])
