How would I use multithreading for an API request like this? - Python

I created a username checker against the Ubisoft API. However, the requests are fairly slow, so I wanted to speed things up, and multithreading was the first thing that came to mind. I know about thread pools and the like, but I have no idea how to apply them to an API request like this one.
import requests
from colorama import Fore

# authToken, usernameList, checkedCount and availableCount are defined elsewhere in the script.

def check():
    global checkedCount
    global availableCount
    headers = {
        'Method': 'GET',
        'Authority': 'public-ubiservices.ubi.com',
        'referer': 'https://lb-prod-acc_ount-pdc.ubisoft.com',
        'Ubi-AppId': 'c5393f10-7ac7-4b4f-90fa-21f8f3451a04',
        'Authorization': authToken,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
        'Ubi-RequestedPlatformType': 'uplay'}
    for name in usernameList:
        r = requests.get("https://public-ubiservices.ubi.com/v3/profiles?nameOnPlatform=" + name + "&platformType=uplay", headers=headers)
        while r.status_code != 200:  # retry on rate limit
            r = requests.get("https://public-ubiservices.ubi.com/v3/profiles?nameOnPlatform=" + name + "&platformType=uplay", headers=headers)
        if not r.json()['profiles']:
            availableCount += 1
            checkedCount += 1
            print(f"{Fore.CYAN}[$]{Fore.RESET} {name} is available")
        else:
            checkedCount += 1
            print(f"{Fore.CYAN}[$]{Fore.RESET} {name} is unavailable")
Please don't mark this as a duplicate: unlike the other questions, I'm not trying to hit multiple different URLs.
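One way to approach this (my own sketch, not taken from the thread) is to move the body of the loop into a per-name worker function and hand the names to a concurrent.futures.ThreadPoolExecutor. The worker name check_name, the pool size, and the way the counters are derived from the results are my additions; authToken and usernameList are assumed to exist as in the question:

import requests
from concurrent.futures import ThreadPoolExecutor
from colorama import Fore

def check_name(name):
    # Same request as in the question, but for one name, so many names can run in parallel.
    headers = {
        'Authorization': authToken,  # assumed to be defined elsewhere, as in the question
        'Ubi-AppId': 'c5393f10-7ac7-4b4f-90fa-21f8f3451a04',
        'Ubi-RequestedPlatformType': 'uplay',
        'User-Agent': 'Mozilla/5.0'}
    url = "https://public-ubiservices.ubi.com/v3/profiles?nameOnPlatform=" + name + "&platformType=uplay"
    r = requests.get(url, headers=headers)
    while r.status_code != 200:  # retry on rate limit, as in the original loop
        r = requests.get(url, headers=headers)
    available = not r.json()['profiles']
    print(f"{Fore.CYAN}[$]{Fore.RESET} {name} is {'available' if available else 'unavailable'}")
    return available

with ThreadPoolExecutor(max_workers=10) as executor:
    results = list(executor.map(check_name, usernameList))

checkedCount = len(results)      # counts are computed from the returned values
availableCount = sum(results)    # instead of incrementing shared globals from multiple threads

Computing the counts from the workers' return values sidesteps the need to lock the global counters. Note that hammering the endpoint from many threads will also trip the rate limit sooner, so a smaller pool or a delay between retries may be needed.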

Related

I am trying to navigate through the pages of a website and scrape its links, but the same page's data is scraped even after changing the page number

from bs4 import BeautifulSoup
import requests
import pymongo

def traverse_source():
    article_links = []
    for pgindx in range(9):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            "path": f"issue/S0196-0644(21)X0012-1?pageStart={pgindx}",
            "Sec-fetch-site": "same-origin",
        }
        source_url = ""
        source_data = requests.get(source_url, headers=headers)
        print(source_data.headers)
        source_url = None
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        destination = source_soup.find_all("h3", attrs={'class': 'toc__item__title'})
        for dest in destination:
            try:
                article_links.append("https://www.annemergmed.com" + dest.a['href'])
            except:
                pass
        source_soup = None
        print(article_links)

if __name__ == "__main__":
    traverse_source()
Here, even after incrementing the page number, the content of the first page is always scraped. I tried navigating through the pages with a GET request (changing the URL), but even after changing the source URL it still scrapes the data of page 1.
The reason the first page keeps coming back is that the page number is being sent in a custom "path" request header, which the server ignores; it has to be part of the requested URL itself. This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

s = requests.Session()
s.headers.update(headers)
big_list = []
for x in tqdm(range(9)):
    r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    titles = soup.select('div.articleCitation')
    for t in titles:
        url = t.select_one('h3 a').get('href')
        header = t.select_one('h3 a').text
        try:
            authors = t.select_one('ul.toc__item__authors').get_text(strip=True)
        except Exception as e:
            authors = 'Unknown'
        big_list.append((header, f'https://www.annemergmed.com{url}', authors))

df = pd.DataFrame(list(set(big_list)), columns=['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]
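For comparison, the original traverse_source needs only a small change in the same direction: move the pageStart value into the URL itself. A sketch of that minimal fix, using the issue URL shown in the answer above and otherwise following the question's code:

from bs4 import BeautifulSoup
import requests

def traverse_source():
    article_links = []
    for pgindx in range(9):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        }
        # The page number now lives in the URL, not in a "path" header.
        source_url = f"https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={pgindx}"
        source_data = requests.get(source_url, headers=headers)
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        for dest in source_soup.find_all("h3", attrs={'class': 'toc__item__title'}):
            if dest.a is not None:
                article_links.append("https://www.annemergmed.com" + dest.a['href'])
    print(article_links)

if __name__ == "__main__":
    traverse_source()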

Python web scraping of LinkedIn users - incomplete result

I need some support from you all. This is practice Python code for scraping employee information and profile URLs from LinkedIn. At the moment it only prints the user names and their current positions, and even that output is incomplete (some entries are just a name without the person's role at the company). The profile URLs are not printed at all.
import random
import argparse
import requests
import re

parser = argparse.ArgumentParser(description='Searches Google For Linkedin Profiles')
parser.add_argument('--keyword', type=str, help='keywords to search')
parser.add_argument('--limit', type=int, help='how many profiles to scrape')
args = parser.parse_args()

class LinkedinScraper(object):
    def __init__(self, keyword, limit):
        #:param keyword: a str of keyword(s) to search for
        #:param limit: number of profiles to scrape
        self.keyword = keyword.replace(' ', '%20')
        self.all_htmls = ""
        self.server = 'www.google.com'
        self.quantity = '100'
        self.limit = int(limit)
        self.counter = 0

    def search(self):
        #perform the search
        #:return: a list of htmls from Google Searches
        # choose a random user agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.142 Chrome/18.0.1025.142 Safari/535.19',
            'Mozilla/5.0 (Windows NT 5.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00'
        ]
        while self.counter < self.limit:
            headers = {'User-Agent': random.choice(user_agents)}
            url = 'http://google.com/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.keyword
            resp = requests.get(url, headers=headers)
            if ("Our systems have detected unusual traffic from your computer network.") in resp.text:
                print("Running into captchas")
                return
            self.all_htmls += resp.text
            self.counter += 10

    def parse_links(self):
        reg_links = re.compile("url=https://www.linkedin.com(.*?)&")
        self.temp = reg_links.findall(self.all_htmls)
        results = []
        for regex in self.temp:
            final_url = regex.replace("url= ", "")
            results.append("https://www.linkedin.com" + final_url)
        return results

    def parse_people(self):
        # :param html: parse the html for Linkedin Profiles using regex
        # :return: a list of names (and roles, where present)
        reg_people = re.compile(r'>[a-zA-Z0-9._ -]* -|\| LinkedIn')
        self.temp = reg_people.findall(self.all_htmls)
        print(self.temp)
        results = []
        for iteration in (self.temp):
            delete = iteration.replace(' | LinkedIn', '')
            delete = delete.replace(' - LinkedIn', '')
            delete = delete.replace(' profiles ', '')
            delete = delete.replace('LinkedIn', '')
            delete = delete.replace('|', '')
            delete = delete.replace('"', '')
            delete = delete.replace('>', '')
            delete = delete.strip("-")
            if delete != " ":
                results.append(delete)
        return results

if __name__ == "__main__":
    ls = LinkedinScraper(keyword="Tesla", limit=100)
    ls.search()
    links = ls.parse_links()
    print(links)
    profiles = ls.parse_people()
    print(*profiles, sep="\n")
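There is no answer attached here, but one direction that may help (my own sketch, not from the thread): instead of running separate regexes over the raw HTML, parse each Google result block with BeautifulSoup so that the profile URL and the "Name - Role" text stay paired. The selectors 'div.g' and 'h3' are assumptions about Google's current result markup and may need adjusting:

from bs4 import BeautifulSoup

def parse_results(html):
    # Pair each result's LinkedIn URL with its title text instead of collecting them separately.
    soup = BeautifulSoup(html, 'html.parser')
    people = []
    for result in soup.select('div.g'):                        # one Google result block (assumed selector)
        link = result.select_one('a[href*="linkedin.com/in"]')
        title = result.select_one('h3')
        if link and title:
            text = title.get_text().replace('| LinkedIn', '').strip()  # usually "Name - Role - Company"
            people.append({'url': link['href'], 'title': text})
    return people

Called on ls.all_htmls after ls.search(), this returns one dictionary per result, which avoids the mismatch between the separate lists produced by parse_links() and parse_people().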

Beautiful Soup web scraping returning None - Python

I have a list of movies whose genres I want to scrape from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup

list = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
gen2 = {}
for i in list:
    user_query = i + 'movie genre'
    URL = 'https://www.google.co.in/search?q=' + user_query
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    c = soup.find(class_='EDblX DAVP1')
    print(c)
    if c != None:
        genres = c.findAll('a')
        gen2[i] = genres
But it returns an empty dict, so I checked the movies one by one and it worked, for example:
import requests
from bs4 import BeautifulSoup

user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
    h['Se7en'] = genre
So I found out that inside the for loop the variable c comes back as None.
I can't figure out why! It only returns None inside the loop.
Currently your query string is built as i + 'movie genre', so the URLs end up looking like https://www.google.co.in/search?q=Se7enmovie genre (no separator and an unencoded space), and the Google results that come back aren't accurate for all the movies.
You can change it to
for i in list:
    i = "+".join(i.split(" "))
    user_query = i + "+movie+genre"
    URL = 'https://www.google.com/search?q=+' + user_query
Also, movies that belong to a single genre, like Cinema Paradiso, have it in a div with the class name "Z0LcW".
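Putting both points together, a sketch of the full loop might look like the following. The class names 'EDblX DAVP1' and 'Z0LcW' come from the question and the answer above and depend on Google's current markup; using urllib.parse.quote_plus instead of joining the words with '+' by hand is my own substitution:

import requests
from urllib.parse import quote_plus
from bs4 import BeautifulSoup

movies = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
genres = {}

for movie in movies:
    # Encode the whole query so spaces in titles like "Toy Story 3" survive in the URL.
    url = 'https://www.google.com/search?q=' + quote_plus(movie + ' movie genre')
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')
    multi = soup.find(class_='EDblX DAVP1')   # multi-genre list, per the question
    single = soup.find(class_='Z0LcW')        # single-genre div, per the answer
    if multi is not None:
        genres[movie] = [a.get_text() for a in multi.find_all('a')]
    elif single is not None:
        genres[movie] = [single.get_text()]

print(genres)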

Python/JSON code - where were these variables located?

This code is from a separate submission.
If you look at the lines:
each['AuthorString']
each['Title']
I'm wondering where the user got these keys from. I navigated to the JSON page (Link) and could not find them. Maybe I'm on the wrong page? Screenshots would help.
Here is the code:
import requests

session_ids = ['13619', '13736']
for session_id in session_ids:
    url = 'https://cdn-solr.asco.org/solr/ml/mlselect'
    payload = '?_format=json&wt=json&indent=true&q=SessionId:' + session_id + '&start=0&rows=30&sort=score%20desc,%20SessionId%20asc&fq=RecordType:sessions&facet=true&f.Year.facet.sort=index&facet.field={!key=Year}Year&facet.field={!key=subject_thes}subject_thes&facet.field={!key=MediaTypes}MediaTypes&facet.field={!key=fctSessionType}fctSessionType&facet.pivot={!key=MeetingName}fctMeetingName,fctTrack&spellcheck.maxCollationTries=100'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    jsonData = requests.get(url + payload, headers=headers).json()
    sessionParticipationID = jsonData['response']['docs'][0]['SessionParticipationID']
    session_id_list = '%20OR%20'.join(sessionParticipationID)
    payload = '?_format=json&wt=json&indent=true&sort=PresentationOrderWithinSession%20asc,%20ISODateString%20asc,%20ISODateStringEnd%20asc&fl=_id,%20score,%20ISODateString,%20ISODateStringEnd,%20ISODateString_1,%20ISODateStringEnd_1,%20Year,%20Title,%20tempAbstractID,%20MediaID,%20VideoID,%20EdBookID,%20edBookTitle,%20PosterID,%20edBookTitle,%20SessionTitle,%20SessionTypeId,%20AuthorString,%20AbstID,%20Role,%20FullName,%20PosterBoard,%20Institution,%20ProgramTitle,%20MeetingName,%20FirstAuthor&q=_id:(' + session_id_list + ')&rows=' + str(len(sessionParticipationID))
    jsonData = requests.get(url + payload, headers=headers).json()
    title_auth = []  # <-- to make a list of {title: author} dictionaries
    for each in jsonData['response']['docs']:
        title = each['Title']  # this line
        author = each['AuthorString']  # and this
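No answer is attached here, but the keys come from the second request's response: 'Title' and 'AuthorString' are among the fields listed in that request's fl parameter, so every document in jsonData['response']['docs'] carries them, even if they are not visible on the page you checked in the browser. A quick way to confirm which keys a document actually exposes (my own snippet, reusing the url, payload, and headers variables from the code above):

jsonData = requests.get(url + payload, headers=headers).json()
for doc in jsonData['response']['docs'][:3]:
    print(sorted(doc.keys()))                            # every field Solr returned for this document
    print(doc.get('Title'), '-', doc.get('AuthorString'))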

Python scraping gmc-uk.org

I am trying to scrape a website: https://www.gmc-uk.org/doctors/register/LRMP.asp
Below is the code I wrote, and it's not working:
import requests, csv, re, sys
from lxml import html

def parser1(keyword, source):
    with open(str(keyword) + '.csv', 'wb') as export:
        writer = csv.writer(export)
        for each in re.findall('<tr><td class="listapplettablerows" >(.+?)</tr>', source, re.DOTALL):
            new_each = '<td class="listapplettablerows" >' + each
            source = html.fromstring(new_each)
            lines = source.xpath('//td[@class="listapplettablerows"]//text()')
            #print (lines)
            try:
                writer.writerow([lines[0], lines[1], lines[2], lines[3], lines[4], lines[5], lines[6]])
            except:
                writer.writerow([lines[0], lines[1], lines[2], lines[3], lines[4], None, lines[5]])

def make_requests(url, keyword, SWETS):
    s = requests.Session()
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Encoding': 'gzip, deflate',
               'Accept-Language': 'en-US,en;q=0.9',
               'Host': 'webcache.gmc-uk.org',
               #'Cookie':'_ga=GA1.2.1612314458.1511275507; _gid=GA1.2.1054886815.1511275507',
               'Referer': 'http://webcache.gmc-uk.org/gmclrmp_enu/start.swe?SWENeedContext=false&SWECmd=GetCachedFrame&W=t&SWEACn=7691&_sn=AVN6CAdOO0TLfHYEWmkfiCc5NXsWqEWnu1QinbOLc8NU.5VYcL46LP-V1h1wBqvlQYqNVBRCbMk6wOV9ByGHIw6-NgaeeOCxe-VxSekkxnLHXZZSKGnrBiJaYUTe-S7K.d3nInri.S4wG6fk0CD4JAEKBxpsYv8C0hibwdV3LcAlTqBpiFSlHFjguoh8q8WZOtzdmX07Geg_&SWEC=1&SWEFrame=top._sweclient._sweview&SWEBID=-1&SRN=&SWETS=',
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
    s.get('https://www.gmc-uk.org/doctors/register/LRMP.asp')
    r = s.get(url)
    formdata = {'s_3_1_5_0': '',
                's_3_1_3_0': '',
                's_3_1_9_0': keyword,
                's_3_1_6_0': '60',
                's_3_1_4_0': '40',
                's_3_1_7_0': '',
                'SWEFo': 'SWEForm3_0',
                'SWEField': 's_3_1_10_0',
                'SWENeedContext': 'true',
                'SWENoHttpRedir': 'true',
                'W': 't',
                'SWECmd': 'InvokeMethod',
                'SWEMethod': 'NewQuerySearch',
                'SWERowIds': '',
                'SWESP': 'false',
                'SWEVI': '',
                'SWESPNR': '',
                'SWEPOC': '',
                'SWESPNH': '',
                'SWEH': '',
                'SWETargetView': '',
                'SWEDIC': 'false',
                '_sn': url.split('_sn=')[1].split('&')[0],
                'SWEReqRowId': '1',
                'SWEView': 'GMC WEB Doctor Search',
                'SWEC': '1',
                'SWERowId': 'VRId-0',
                'SWETVI': '',
                'SWEW': '',
                'SWEBID': re.findall('navigator.id = "(.+?)"', r.text, re.DOTALL)[0],
                'SWEM': '',
                'SRN': '',
                'SWESPa': '',
                'SWETS': SWETS,
                'SWEContainer': '',
                'SWEWN': '',
                'SWEKeepContext': '0',
                'SWEApplet': 'GMC WEB Health Provider Search Applet',
                'SWETA': ''}
    headers['Referer'] = url
    r1 = s.post('http://webcache.gmc-uk.org/gmclrmp_enu/start.swe', data=formdata)
    if 'Sorry but we cannot find a record that matches your search' not in r1.text:
        parser1(keyword, r1.text)

make_requests(sys.argv[1], sys.argv[2], sys.argv[3])
The problem is the SWETS key in the formdata dictionary. When I inspected the network traffic I found that the website sends a POST request with SWETS set to a 13-digit GMT Unix timestamp, but I can't work out how to produce the correct value: I don't see anything like it in the JavaScript responses from the server, and when I send my own generated 13-digit timestamp the site says the input is invalid. Please take a look and advise on possible steps.
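No answer is attached here, but for reference, a 13-digit Unix timestamp is just the current time in milliseconds since the epoch, which Python can produce directly (my own snippet; whether this Siebel endpoint accepts a freshly generated value, or only the SWETS it issued with the page, is a separate question this thread does not settle):

import time

# 13-digit GMT Unix timestamp: milliseconds since the epoch.
swets = str(int(time.time() * 1000))
print(swets)   # e.g. something like '1511275507000' - 13 digits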
