I'm using Selenium with Python, and I'm trying to scrape this page: https://www.vexforum.com/u?period=all. I want to get the data for all 40,000 or so users on this forum, but only 50 load initially; you have to keep scrolling to load the rest of the members. Is there any way to request the entire page at once, with all 40k members? Thanks for any help you can provide!
You should use requests (if the site's robots.txt allows it):
import requests

# The session-specific headers from the browser capture (Cookie,
# X-CSRF-Token, Discourse-Present) are omitted; the directory
# endpoint is public and doesn't need them.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Referer': 'https://www.vexforum.com/u?period=all',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    'X-Requested-With': 'XMLHttpRequest',
}

count = 0  # the infinite scroll fetches page 0, 1, 2, ...
while True:
    params = {
        'order': 'likes_received',
        'page': str(count),
        'period': 'all',
    }
    # Hit the JSON endpoint the infinite scroll uses, one page at a time.
    r = requests.get('https://www.vexforum.com/directory_items',
                     headers=headers, params=params)
    r.raise_for_status()
    data = r.json()
    if not data.get('directory_items'):
        break  # an empty page means we've reached the last member
    print(data)
    print('___________________________________________________')
    count += 1
Now you only have to parse the JSON response and grab the information you want.
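For example, a minimal sketch of pulling a username and like count out of one page (the directory_items and nested user field names are an assumption based on the standard Discourse directory payload; verify them against an actual response first):

import requests

r = requests.get('https://www.vexforum.com/directory_items',
                 params={'order': 'likes_received', 'page': '0', 'period': 'all'},
                 headers={'Accept': 'application/json'})
# Assumed shape: {"directory_items": [{"user": {"username": ...}, "likes_received": ...}]}
for item in r.json().get('directory_items', []):
    print(item.get('user', {}).get('username'), item.get('likes_received'))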
I have an issue with the website https://damas.terna.it/DMSPCAS08. I am trying either to scrape the data or to fetch the Excel file that is included on the page. I tried to fetch the Excel file with a POST request:
import requests
from bs4 import BeautifulSoup
import json
import datetime

url = 'https://damas.terna.it/api/Ntc/GetNtc'

headers = {
    'Host': 'damas.terna.it',
    'Connection': 'keep-alive',
    'sec-ch-ua': '" Not A;Brand";v="99", "Chromium";v="90", "Google Chrome";v="90"',
    'sec-ch-ua-mobile': '?0',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://damas.terna.it/DMSPCAS08',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9',
    'Cookie': '__RequestVerificationToken=5mfiSM2dKte532I8fd3MRdn6nnHbSezkQX29r3fyF2tMegXsvnOInpxy8JvFuDyRVS6pZs03y-NL3CsNsItN1yboc128Kk51yEiuUU0mel41; pers_damas_2019=378972352.20480.0000; rxVisitor=1619766836714T8SRPFLUAH62F1R9K6KG3EKK104BSDFE; dtCookie=7$EC34ED7BFB4A503D379D8D8C69242174|846cd19ce7947380|1; rxvt=1619774462088|1619771198468; dtPC=7$369396404_351h-vIDMPHTCURIGKVKBWVVEWOAMRMKDNWCUH-0e1; DamasNetCookie=F-evmYb7kIS_YTMr2mwuClMB1zazemmhl9vzSXynWeuCII_keDb_jQr4VLSYif9t3juDS6LkOuIXKFfe8pydxSzHPfZzGveNB6xryj2Czp9J1qeWFFT9dYFlRXFWAHuaEIyUQQDJmzWfDBrFCWr309mZoE6hkCKzDtoJgIoor9bed1kQgcdeymAH9lrtrKxwsheaQm2qA-vWWqKjCiruO1VkJ6II_bcrAXU2A_ZPQPznE1_8AEC_AwXmBXETubMQwFAnDXsOLDfEYeQ61TGAupF3d-wz3aZfRs5eCA3kw-en-kpEbS0trwFBQzB-098610GIbIPki9ThVitZ2LN2zza6nn1A8qchqfQC_CZEgu6Jt1glfhHceWS6tvWCuyOEqo2jJpxAajMYXPB6mzlHzX13TiV-jgeFSPehugMAgms_exqocw9w27e4lI5laYZu0rkKkznpZ1mJLOhORwny8-bKa3nRUt7erFv7ul3nLLrgd3FP907tHpTh-qXt1Bmr6OqknDZr_EBN8GY_B2YHV-8hC0AjdqQqpS0xOpp7z_CzzgByTOHSNdeKjVgQfZLQ7amnp71lhxgPeJZvOIl_mIWOr_gWRy_iK6UuzrA3udCTV7bAnUXKB8gX89d9ShQf5tZDxPFchrAQBtdmDChQOA; dtLatC=2; dtSa=true%7CC%7C-1%7CExport%20to%20.xls%7C-%7C1619772685174%7C369396404_351%7Chttps%3A%2F%2Fdamas.terna.it%2FDMSPCAS08%7CTerna%20%5Ep%20NTC%7C1619772662568%7C%7C'
}

parameters = {
    'busDay': '2021-05-01',
    'busDayTill': '2021-05-01',
    'borderDirId': '1',
    'borderDirName': 'TERNA-APG',
}

response = requests.post(url, data=parameters, headers=headers)
soup = BeautifulSoup(response.text, 'html.parser')
print(soup.prettify())
I am receiving this error:
The parameters dictionary contains an invalid entry for parameter 'parameters' for method 'System.Web.Mvc.ActionResult GetNtc(Damas.Core.Data.DataSource.Data.ParametersModel)' in 'Terna.Web.Controllers.CapacityManagement.NtcController'. The dictionary contains a value of type 'System.Collections.Generic.Dictionary`2[System.String,System.Object]', but the parameter requires a value of type 'Damas.Core.Data.DataSource.Data.ParametersModel'.
Parameter name: parameters
(Please don't post the answer to your question in the question's body; instead, post it in the answer box.)

response = requests.post(url, data=json.dumps(parameters), headers=headers) seems to solve the issue: the endpoint expects a JSON body, and serializing the dict with json.dumps sends it as one instead of as form fields.
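Note that requests can do the serialization itself: the json= keyword encodes the dict as a JSON body and sets Content-Type: application/json automatically. A minimal equivalent, reusing the url and parameters from above (add the question's headers if the endpoint needs the session cookie):

import requests

url = 'https://damas.terna.it/api/Ntc/GetNtc'
parameters = {
    'busDay': '2021-05-01',
    'busDayTill': '2021-05-01',
    'borderDirId': '1',
    'borderDirName': 'TERNA-APG',
}

# json= serializes the dict and sets the Content-Type header for us.
response = requests.post(url, json=parameters)
print(response.status_code)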
import requests
import json
import csv
import pandas as pd
import time
from bs4 import BeautifulSoup
from requests import Session
url = 'https://www.agathaparis.com/ajax.V1.php/en_US/Rbs/Storelocator/Store/'

payload = {
    "websiteId": 603593,
    "sectionId": 603593,
    "pageId": 868982,
    "data": {
        "currentStoreId": 0,
        "distanceUnit": "kilometers",
        "distance": "50kilometers",
        "coordinates": {"latitude": 48.856614, "longitude": 2.3522219},
        "commercialSign": 0,
    },
    "dataSets": "coordinates,address,card,allow",
    "URLFormats": "canonical,contextual",
    "visualFormats": "original,listItem",
    "pagination": "0,50",
    "referer": "https://www.agathaparis.com/our-stores.html",
}

s = requests.Session()
s.get('https://www.agathaparis.com/our-stores.html')

headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-gb',
    'Host': 'www.agathaparis.com',
    'Origin': 'https://www.agathaparis.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    'Connection': 'keep-alive',
    'Referer': 'https://www.agathaparis.com/our-stores.html',
    'Content-Length': '407',
    'Cookie': '_fbp=fb.1.1607609084947.2075070555; _ga=GA1.2.964068958.1607609084; _gid=GA1.2.1470390017.1607868080; _gat_UA-33249847-1=1; rbsWebsiteTrackerHasConsent=true; rbsWebsiteTrackerHasConsentGdpr=%7B%22technical%22%3Atrue%2C%22analytics%22%3Atrue%2C%22advertising%22%3Atrue%7D; PHPSESSID=n4uf5tfuf96k141vemo5s9g99g',
}

resp = s.post(url, data=payload, headers=headers)
I am trying to extract the store list through this POST request, but I don't understand what I am missing. Thanks in advance for your help!
Your main error is that you post with the wrong body encoding: data= sends the payload as application/x-www-form-urlencoded even though the header claims JSON. You need to actually post JSON:
headers = {
    'Content-Type': 'application/json',
    'Accept': 'application/json, text/plain, */*',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-gb',
    'Host': 'www.agathaparis.com',
    'Origin': 'https://www.agathaparis.com',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0.1 Safari/605.1.15',
    # 'Connection': 'keep-alive',
    'Referer': 'https://www.agathaparis.com/our-stores.html',
    'x-http-method-override': 'GET',
    # 'Content-Length': '407',
    # 'Cookie': '_fbp=fb.1.1607609084947.2075070555; _ga=GA1.2.964068958.1607609084; _gid=GA1.2.1470390017.1607868080; _gat_UA-33249847-1=1; rbsWebsiteTrackerHasConsent=true; rbsWebsiteTrackerHasConsentGdpr=%7B%22technical%22%3Atrue%2C%22analytics%22%3Atrue%2C%22advertising%22%3Atrue%7D; PHPSESSID=n4uf5tfuf96k141vemo5s9g99g',
}

resp = s.post(url, json=payload, headers=headers)
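If the POST succeeds, the store list comes back in the JSON body; a short follow-up to the snippet above (the exact key layout of the response is an assumption to verify by printing it first):

resp.raise_for_status()  # fail loudly on a non-2xx status
data = resp.json()
print(data)  # inspect the structure, then drill into the store entries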
In the following code, I am trying to POST to a Microsoft online account; the flow starts with a page that requires posting an email address. This is my attempt so far:
import requests
from bs4 import BeautifulSoup

url = 'https://moe-register.emis.gov.eg/account/login?ReturnUrl=%2Fhome%2FRegistrationForm'

headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'en-US,en;q=0.9,ar;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': '__RequestVerificationToken=vdS3aPPg5qQ2bH9ADTppeKIVJfclPsMI6dqB6_Ru11-2XJPpLfs7jBlejK3n0PZuYl-CwuM2hmeCsXzjZ4bVfj2HGLs2KOfBUphZHwO9cOQ1; .AspNet.MOEEXAMREGFORM=ekeG7UWLA6OSbT8ZoOBYpC_qYMrBQMi3YOwrPGsZZ_3XCuCsU1BP4uc5QGGE2gMnFgmiDIbkIk_8h9WtTi-P89V7ME6t_mBls6T3uR2jlllCh0Ob-a-a56NaVNIArqBLovUnLGMWioPYazJ9DVHKZY7nR_SvKVKg2kPkn6KffkpzzHOUQAatzQ2FcStZBYNEGcfHF6F9ZkP3VdKKJJM-3hWC8y62kJ-YWD0sKAgAulbKlqcgL1ml6kFoctt2u66eIWNm3ENnMbryh8565aIk3N3UrSd5lBoO-3Qh8jdqPCCq38w3cURRzCd1Z1rhqYb3V2qYs1ULRT1_SyRXFQLrJs5Y9fsMNkuZVeDp_CKfyzM',
    'Host': 'moe-register.emis.gov.eg',
    'Origin': 'https://moe-register.emis.gov.eg',
    'Referer': 'https://moe-register.emis.gov.eg/account/login?ReturnUrl=%2Fhome%2FRegistrationForm',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
}

with requests.Session() as s:
    # r = s.post(url)
    # soup = BeautifulSoup(r.content, 'lxml')
    data = {'EmailAddress': '476731809@matrouh1.moe.edu.eg'}
    r_post = s.post(url, data=data, headers=headers, verify=False)
    soup = BeautifulSoup(r_post.content, 'lxml')
    print(soup)
What I got back is the same page asking to post the email again; I expected the page that asks for the sign-in password. This is the starting page (screenshot omitted), and this is an example of the email that needs to be posted: 476731809@matrouh1.moe.edu.eg. I have also tried code that signs in with the credentials directly, but I got the sign-in page again (although the credentials are correct).
Can you please try this code:
import requests
from bs4 import BeautifulSoup

url = 'https://login.microsoftonline.com/common/login'

s = requests.Session()
res = s.get('https://login.microsoftonline.com')
cookies = dict(res.cookies)
res = s.post(url,
             auth=('476731809@matrouh1.moe.edu.eg', 'Std#050202'),
             verify=False,
             cookies=cookies)
soup = BeautifulSoup(res.text, 'html.parser')
print(soup)
I checked out the page and the following seems to be working:
import requests

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'https://moe-register.emis.gov.eg',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_1_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://moe-register.emis.gov.eg/account/login',
    'Accept-Language': 'en-US,en;q=0.9,gl;q=0.8,fil;q=0.7,hi;q=0.6',
}

data = {
    'EmailAddress': '476731809@matrouh1.moe.edu.eg'
}

response = requests.post('https://moe-register.emis.gov.eg/account/authenticate', headers=headers, data=data, verify=False)
Your POST endpoint seems to be wrong: the login form redirects from /login to /authenticate, so you need to post to /authenticate to proceed with the request. (I am on a Mac, so my User-Agent may be different from yours or from what the site requires; you can change it in the headers variable.)
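If the plain POST ever stops working, note that the site sets ASP.NET anti-forgery cookies (the __RequestVerificationToken visible in the question's Cookie header). A hedged sketch that uses a requests.Session so those cookies are picked up automatically, and posts back the hidden form token if the page embeds one (the field name is an assumption based on standard ASP.NET MVC forms):

import requests
from bs4 import BeautifulSoup

s = requests.Session()
login_url = 'https://moe-register.emis.gov.eg/account/login'

# GET first so the session collects the anti-forgery cookies.
r = s.get(login_url, verify=False)

# Standard ASP.NET MVC forms embed a hidden token that must be posted
# back; adjust the field name if this site differs (assumption).
soup = BeautifulSoup(r.text, 'html.parser')
token = soup.find('input', {'name': '__RequestVerificationToken'})

data = {'EmailAddress': '476731809@matrouh1.moe.edu.eg'}
if token is not None:
    data['__RequestVerificationToken'] = token['value']

resp = s.post('https://moe-register.emis.gov.eg/account/authenticate',
              data=data, verify=False)
print(resp.status_code)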
I have been struggling with a problem for a day. I want to crawl a website that has multiple pages. I know how to crawl a site when each page has a different URL (page=1, page=2, etc.), but the website I'm trying to scrape never changes its URL when I move to the next page. Is there any way to scrape this kind of page? Thank you!
The code below is the result of converting the browser's curl command to Python:
import requests

cookies = {
    'WMONID': 'smDC5Ku5TeX',
    'userId': 'robin9634',
    'UID': 'robin9634',
    'JSESSIONID': 'lLqLdHFEk4iEJdQ2HCR5m05tg6ZIxBdegEamDzxeEoTClkvqVDN4xzXeMPtTIN3e.cG9ydGFsX2RvbWFpbi9wZDU=',
}

headers = {
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
    'Upgrade-Insecure-Requests': '1',
    'Origin': 'https://dhlottery.co.kr',
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://dhlottery.co.kr/gameInfo.do?method=powerWinNoList',
    'Accept-Language': 'ko-KR,ko;q=0.9,en-US;q=0.8,en;q=0.7',
}

params = (
    ('method', 'powerWinNoList'),
)

data = {
    'nowPage': '7',
    'searchDate': '20200909',
    'calendar': '2020-09-09',
    'sortType': 'num',
}

response = requests.post('https://dhlottery.co.kr/gameInfo.do', headers=headers, params=params, cookies=cookies, data=data)
#NB. Original query string below. It seems impossible to parse and
#reproduce query strings 100% accurately so the one below is given
#in case the reproduced version is not "correct".
# response = requests.post('https://dhlottery.co.kr/gameInfo.do?method=powerWinNoList', headers=headers, cookies=cookies, data=data)
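The trick in the capture above is that the page number travels in the POST body ('nowPage'), not in the URL, which is why the address bar never changes. A hedged sketch for walking several result pages by varying that field (headers trimmed to the ones that plausibly matter, and the page range is an assumption):

import requests

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
}

for page in range(1, 11):  # first 10 result pages, assuming they exist
    data = {
        'nowPage': str(page),
        'searchDate': '20200909',
        'calendar': '2020-09-09',
        'sortType': 'num',
    }
    response = requests.post('https://dhlottery.co.kr/gameInfo.do',
                             params={'method': 'powerWinNoList'},
                             headers=headers, data=data)
    print(page, response.status_code)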
Hello, I am trying to log in to https://www.neighborwho.com using Requests for Python, but the website response keeps telling me that it cannot find any user with my username, even though I can log in manually in a normal browser. I know I could use a headless browser, or maybe lxml or MechanicalSoup, etc., but I am learning Python and requests right now, so I want to see if it can be done with requests.
Here is my code:
import requests

url = 'https://www.neighborwho.com/api/v5/session'

payload = {'user[email]': 'my_username',
           'user[password]': 'my_password'}

headers = {'referer': 'https://www.neighborwho.com/app/login',
           'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
           'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
           'x-requested-with': 'XMLHttpRequest',
           'origin': 'https://www.neighborwho.com',
           'accept': 'application/json, text/javascript, */*; q=0.01',
           'accept-encoding': 'gzip, deflate, br',
           'accept-language': 'en-US,en;q=0.9',
           'content-length': '451',
           'sec-fetch-mode': 'cors',
           'sec-fetch-site': 'same-origin'}

s = requests.Session()
resp = s.post(url, data=payload, headers=headers)

print(resp.status_code)
print(resp.text)
Here is the output I am getting:
401
{"session":{"errors":"We do not see an account that matches that
email/password combination. For security reasons we may occasionally reset
passwords. If you have an account that matches the email address
\"my_username\" and need to reset your password, please use the link
below."},"meta":{"status":401}}