Please, I need some support from you all. This is a practice Python script for scraping employee information / user URLs from LinkedIn via Google search. It can currently only print the user names and their current positions, and even that output is incomplete (some entries are just a name without the role at the company). On top of that, the user URLs are not printed at all.
import random
import argparse
import requests
import re
parser = argparse.ArgumentParser(description='Searches Google For Linkedin Profiles')
parser.add_argument('--keyword', type=str, help='keywords to search')
parser.add_argument('--limit', type=int, help='how many profiles to scrape')
args = parser.parse_args()
class LinkedinScraper(object):
    def __init__(self, keyword, limit):
        # :param keyword: a str of keyword(s) to search for
        # :param limit: number of profiles to scrape
        self.keyword = keyword.replace(' ', '%20')
        self.all_htmls = ""
        self.server = 'www.google.com'
        self.quantity = '100'
        self.limit = int(limit)
        self.counter = 0

    def search(self):
        # perform the search
        # :return: a list of htmls from Google Searches
        # choose a random user agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.142 Chrome/18.0.1025.142 Safari/535.19',
            'Mozilla/5.0 (Windows NT 5.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00'
        ]
        while self.counter < self.limit:
            headers = {'User-Agent': random.choice(user_agents)}
            url = 'http://google.com/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.keyword
            resp = requests.get(url, headers=headers)
            if "Our systems have detected unusual traffic from your computer network." in resp.text:
                print("Running into captchas")
                return
            self.all_htmls += resp.text
            self.counter += 10

    def parse_links(self):
        reg_links = re.compile("url=https://www.linkedin.com(.*?)&")
        self.temp = reg_links.findall(self.all_htmls)
        results = []
        for regex in self.temp:
            final_url = regex.replace("url= ", "")
            results.append("https://www.linkedin.com" + final_url)
        return results

    def parse_people(self):
        # :param html: parse the html for Linkedin Profiles using regex
        # :return: a list of names (and, where present, current positions)
        reg_people = re.compile(r'>[a-zA-Z0-9._ -]* -|\| LinkedIn')
        self.temp = reg_people.findall(self.all_htmls)
        print(self.temp)
        results = []
        for iteration in self.temp:
            delete = iteration.replace(' | LinkedIn', '')
            delete = delete.replace(' - LinkedIn', '')
            delete = delete.replace(' profiles ', '')
            delete = delete.replace('LinkedIn', '')
            delete = delete.replace('|', '')
            delete = delete.replace('"', '')
            delete = delete.replace('>', '')
            delete = delete.strip("-")
            if delete != " ":
                results.append(delete)
        return results


if __name__ == "__main__":
    ls = LinkedinScraper(keyword="Tesla", limit=100)
    ls.search()
    links = ls.parse_links()
    print(links)
    profiles = ls.parse_people()
    print(*profiles, sep="\n")
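One likely reason parse_links() prints nothing is that Google's result markup rarely contains the literal url=https://www.linkedin.com...& pattern the regex expects. Below is a minimal, untested sketch of a more tolerant extraction; it assumes the profile URLs appear somewhere in the downloaded HTML, either as plain links or URL-encoded inside Google's /url?q= redirects, and the pattern is an assumption rather than something verified against live Google pages:

import re
from urllib.parse import unquote

def extract_profile_links(html):
    """Collect LinkedIn profile URLs from a blob of Google results HTML.

    Assumption: profile URLs show up either as plain href values or inside
    Google's /url?q=... redirect wrappers; decoding %3A / %2F first makes
    both forms matchable by one pattern.
    """
    decoded = unquote(html)  # turn %3A%2F%2F back into ://
    pattern = re.compile(r'https://[a-z]{0,3}\.?linkedin\.com/in/[A-Za-z0-9_\-%.]+')
    # dict.fromkeys() deduplicates while preserving the original order
    return list(dict.fromkeys(pattern.findall(decoded)))

# hypothetical usage with the scraper above:
#   links = extract_profile_links(ls.all_htmls)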
from bs4 import BeautifulSoup
import requests
import pymongo
def traverse_source():
    article_links = []
    for pgindx in range(9):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
            "path": f"issue/S0196-0644(21)X0012-1?pageStart={pgindx}",
            "Sec-fetch-site": "same-origin",
        }
        source_url = ""
        source_data = requests.get(source_url, headers=headers)
        print(source_data.headers)
        source_url = None
        source_soup = BeautifulSoup(source_data.content, "html.parser")
        destination = source_soup.find_all("h3", attrs={'class': 'toc__item__title'})
        for dest in destination:
            try:
                article_links.append("https://www.annemergmed.com" + dest.a['href'])
            except:
                pass
        source_soup = None
    print(article_links)


if __name__ == "__main__":
    traverse_source()
Here, even after incrementing the page number, only the content of the first page is ever scraped. I tried navigating through the pages with GET requests (changing the URL), but even after changing the source URL it still returns the data of page 1.
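For what it's worth, the key change in the answer below is that the page index has to go into the request URL itself; a "path" entry in the headers dict is just sent as an ordinary HTTP header and does not affect which page the server returns. Roughly (URL taken from the answer below):

import requests

headers = {"User-Agent": "Mozilla/5.0"}
for pgindx in range(9):
    # the page index belongs in the query string of the URL, not in a header
    url = f"https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={pgindx}"
    resp = requests.get(url, headers=headers)
    print(pgindx, resp.status_code)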
This is one way of scraping that data:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}
s = requests.Session()
s.headers.update(headers)

big_list = []
for x in tqdm(range(9)):
    r = s.get(f'https://www.annemergmed.com/issue/S0196-0644(21)X0012-1?pageStart={x}')
    soup = BeautifulSoup(r.text, 'html.parser')
    titles = soup.select('div.articleCitation')
    for t in titles:
        url = t.select_one('h3 a').get('href')
        header = t.select_one('h3 a').text
        try:
            authors = t.select_one('ul.toc__item__authors').get_text(strip=True)
        except Exception as e:
            authors = 'Unknown'
        big_list.append((header, f'https://www.annemergmed.com{url}', authors))

df = pd.DataFrame(list(set(big_list)), columns=['Title', 'Url', 'Authors'])
print(df.shape)
print(df.head(50))
This will return:
(409, 3)
Title Url Authors
0 194 Challenging the Dogma of Radiographs a Joint Above and Below a Suspected Fracture: Quantification of Waste in Wrist Fracture Evaluation https://www.annemergmed.com/article/S0196-0644(21)01046-5/fulltext M. Rozum,D. Mark Courtney,D. Diercks,S. McDonald
1 112 A Geographical Analysis of Access to Trauma Care From US National Parks in 2018 https://www.annemergmed.com/article/S0196-0644(21)00963-X/fulltext S. Robichaud,K. Boggs,B. Bedell,...A. Sullivan,N. Harris,C. Camargo
2 87 Emergency Radiology Overreads Change Management of Transferred Patients With Traumatic Injuries https://www.annemergmed.com/article/S0196-0644(21)00937-9/fulltext M. Vrablik,R. Kessler,M. Vrablik,...J. Robinson,D. Hippe,M. Hall
[...]
I have a list of movies whose genres I want to scrape from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup
list = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
gen2 = {}
for i in list:
    user_query = i + 'movie genre'
    URL = 'https://www.google.co.in/search?q=' + user_query
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    c = soup.find(class_='EDblX DAVP1')
    print(c)
    if c != None:
        genres = c.findAll('a')
        gen2[i] = genres
But it returns an empty dict, so I checked the movies one by one and each worked on its own, for example:
import requests
from bs4 import BeautifulSoup
user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
    h['Se7en'] = genre
So I found out that inside the for loop the variable c ends up as None.
I can't figure out why; it only returns None inside the loop.
Currently, your URLs are of the form
`https://www.google.co.in/search?q=Cinema Paradisomovie genre`
(raw spaces, and no separator before "movie genre"), so the returned Google results aren't accurate for all the movies.
You can change it to
for i in list:
    i = "+".join(i.split(" "))
    user_query = i + "+movie+genre"
    URL = 'https://www.google.com/search?q=+' + user_query
Also, movies that belong to a single genre, like Cinema Paradiso, appear in a div with class name "Z0LcW".
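Putting the two observations together, a possible sketch that checks both containers (the class names 'EDblX DAVP1' and 'Z0LcW' come straight from the question and this answer, and will break whenever Google changes its markup):

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
movies = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
genres_by_movie = {}

for title in movies:
    query = "+".join(title.split()) + "+movie+genre"
    url = 'https://www.google.com/search?q=' + query
    soup = BeautifulSoup(requests.get(url, headers=headers).content, 'html.parser')

    multi = soup.find(class_='EDblX DAVP1')   # multi-genre carousel
    single = soup.find(class_='Z0LcW')        # single-genre answer box
    if multi is not None:
        genres_by_movie[title] = [a.get_text(strip=True) for a in multi.find_all('a')]
    elif single is not None:
        genres_by_movie[title] = [single.get_text(strip=True)]

print(genres_by_movie)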
This code is from a separate submission. If you look at the lines
each['AuthorString']
each['Title']
I'm wondering where the user got these keys from. I navigated to the JSON page and could not find these fields. Maybe I'm on the wrong page? Screenshots would help.
Here is the code:
import requests

session_ids = ['13619', '13736']
for session_id in session_ids:
    url = 'https://cdn-solr.asco.org/solr/ml/mlselect'
    payload = '?_format=json&wt=json&indent=true&q=SessionId:' + session_id + '&start=0&rows=30&sort=score%20desc,%20SessionId%20asc&fq=RecordType:sessions&facet=true&f.Year.facet.sort=index&facet.field={!key=Year}Year&facet.field={!key=subject_thes}subject_thes&facet.field={!key=MediaTypes}MediaTypes&facet.field={!key=fctSessionType}fctSessionType&facet.pivot={!key=MeetingName}fctMeetingName,fctTrack&spellcheck.maxCollationTries=100'
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.85 Safari/537.36'}
    jsonData = requests.get(url + payload, headers=headers).json()

    sessionParticipationID = jsonData['response']['docs'][0]['SessionParticipationID']
    session_id_list = '%20OR%20'.join(sessionParticipationID)
    payload = '?_format=json&wt=json&indent=true&sort=PresentationOrderWithinSession%20asc,%20ISODateString%20asc,%20ISODateStringEnd%20asc&fl=_id,%20score,%20ISODateString,%20ISODateStringEnd,%20ISODateString_1,%20ISODateStringEnd_1,%20Year,%20Title,%20tempAbstractID,%20MediaID,%20VideoID,%20EdBookID,%20edBookTitle,%20PosterID,%20edBookTitle,%20SessionTitle,%20SessionTypeId,%20AuthorString,%20AbstID,%20Role,%20FullName,%20PosterBoard,%20Institution,%20ProgramTitle,%20MeetingName,%20FirstAuthor&q=_id:(' + session_id_list + ')&rows=' + str(len(sessionParticipationID))
    jsonData = requests.get(url + payload, headers=headers).json()

    title_auth = []  # <-- to make a list of {title: author} dictionaries
    for each in jsonData['response']['docs']:
        title = each['Title']  # this line
        author = each['AuthorString']  # and this
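As for where those keys come from: the second payload's fl= parameter explicitly asks Solr to return fields such as Title and AuthorString, so every dict in jsonData['response']['docs'] carries them as keys. A quick way to confirm which fields a doc actually has is to print its keys; this small inspection snippet is meant to run right after the second request above:

# run right after the second requests.get(...).json() call above;
# each doc is a plain dict, so its keys are exactly the fields Solr
# returned via the fl= parameter
for each in jsonData['response']['docs']:
    print(sorted(each.keys()))  # expect to see 'Title', 'AuthorString', ...
    break  # one document is enough to see the field names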
I am trying to gather some information about books available on Amazon, and I am hitting a weird glitch that I can't understand. At first I thought Amazon was blocking my connection, but then I noticed the request returns "200 OK" and the response contains the real HTML content of the corresponding page.
Let's take for example this book: https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110/ref=sr_1_1?crid=2PPCQEJD706VY&dchild=1&keywords=books+bestsellers+2020+paperback&qid=1598132071&sprefix=book%2Caps%2C234&sr=8-1'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")

price = {}
if soup.select("#buyBoxInner > ul > li > span > .a-text-strike") != []:
    price["regular_price"] = float(
        soup.select("#buyBoxInner > ul > li > span > .a-text-strike")[0].string[1:].replace(",", "."))
    price["promo_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
else:
    price["regular_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
price["currency"] = soup.select(".offer-price")[0].string[0]
This part works fine and I can have the regular price and a promo price (if exists), and even the currency. But when I do this:
isbn = soup.select("td.bucket > .content > ul > li")[4].contents[1].string.strip().replace("-", "")
I get "IndexError: list index out of range". But if I debug the code, the content is actually there!
Is this a bug of BeautifulSoup? Is the request response too long?
It seems that Amazon returns two versions of the page: one that contains <td class="bucket"> and one that uses several <span> tags instead. This script tries to extract the ISBN from both of them:
import requests
from bs4 import BeautifulSoup
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110'
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.content, features="lxml")
isbn_10 = soup.select_one('span.a-text-bold:contains("ISBN-10"), b:contains("ISBN-10")').find_parent().text
isbn_13 = soup.select_one('span.a-text-bold:contains("ISBN-13"), b:contains("ISBN-13")').find_parent().text
print(isbn_10.split(':')[-1].strip())
print(isbn_13.split(':')[-1].strip())
Prints:
0241985110
978-0241985113
I wish I had an explanation of the problem, but a workaround would be to wrap your code in a function like so:
def scrape():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110/ref=sr_1_1?crid=2PPCQEJD706VY&dchild=1&keywords=books+bestsellers+2020+paperback&qid=1598132071&sprefix=book%2Caps%2C234&sr=8-1'
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, features="lxml")

    price = {}
    if soup.select("#buyBoxInner > ul > li > span > .a-text-strike") != []:
        price["regular_price"] = float(
            soup.select("#buyBoxInner > ul > li > span > .a-text-strike")[0].string[1:].replace(",", "."))
        price["promo_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
    else:
        price["regular_price"] = float(soup.select(".offer-price")[0].string[1:].replace(",", "."))
    price["currency"] = soup.select(".offer-price")[0].string[0]

    # ADD THIS FEATURE TO YOUR CODE
    isbn = soup.select("td.bucket > .content > ul > li")
    if not isbn:
        return scrape()  # try again with a fresh request
    isbn = isbn[4].contents[1].string.strip().replace("-", "")
Then if it fails it will just call itself again. You might want to refactor it so it only makes the request once.
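One way to do that refactor, sketched under the same assumptions as the snippet above (same selector, same page), is a small retry loop that only re-requests while the td.bucket layout is missing, instead of recursing:

import time

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
url = 'https://www.amazon.co.uk/All-Rage-Cara-Hunter/dp/0241985110'

def fetch_isbn(url, max_tries=3):
    """Re-request until the td.bucket layout shows up, then pull the ISBN."""
    for _ in range(max_tries):
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, features="lxml")
        rows = soup.select("td.bucket > .content > ul > li")
        if len(rows) > 4:
            return rows[4].contents[1].string.strip().replace("-", "")
        time.sleep(1)  # brief pause before trying again (other page variant)
    return None  # gave up: Amazon kept serving the <span>-based layout

print(fetch_isbn(url))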
So basically, websites list their Skype ID on their pages in one of these formats: "Skype ID: USERNAMEWOULDBEHERE" or "Skype: USERNAMEWOULDBEHERE".
I'm just trying to extract their usernames/Skype ID.
Am I doing anything wrong? How would I check for both strings? (Skype: & Skype ID:)
Help is much appreciated. I'm a beginner in Python so please go easy with me lol.
#!/usr/bin/env python2
# -*- coding: utf8 -*-
import sys
import time
import random
import argparse
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import NoSuchFrameException
from selenium.webdriver.common.keys import Keys
# If this script no longer fetches any results check the XPath
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--search', help='Enter the search term')
    parser.add_argument('-p', '--pages', default='1', help='Enter how many pages to scrape (1 page = 100 results)')
    return parser.parse_args()

def start_browser():
    br = webdriver.Firefox()
    br.implicitly_wait(10)
    return br

def get_ua():
    ua_list = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']
    ua = random.choice(ua_list)
    return ua

def scrape_results(br):
    links = br.find_elements_by_xpath("Skype ID: ")
    results = []
    for link in links:
        title = link.text.encode('utf8')
        url = link.get_attribute('href')
        title_url = (title, url)
        results.append(title_url)
    return results

def go_to_page(br, page_num, search_term):
    page_num = page_num - 1
    start_results = page_num * 100
    start_results = str(start_results)
    url = 'https://www.google.com/webhp?#num=100&start='+start_results+'&q='+search_term
    print '[*] Fetching 100 results from page '+str(page_num+1)+' at '+url
    br.get(url)
    time.sleep(2)

def main():
    args = parse_args()
    br = start_browser()
    if not args.search:
        sys.exit("[!] Enter a term or phrase to search with the -s option: -s 'dan mcinerney'")
    search_term = args.search
    pages = args.pages
    all_results = []
    for page_num in xrange(int(pages)):
        page_num = page_num + 1  # since it starts at 0
        go_to_page(br, page_num, search_term)
        titles_urls = scrape_results(br)
        for title in titles_urls:
            all_results.append(title)
    for result in all_results:
        title = result[0]
        url = result[1]
        print '[+]', title, '--', url
    br.quit()

if __name__ == "__main__":
    main()
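On the actual question: find_elements_by_xpath() expects an XPath expression, so passing the plain string "Skype ID: " will not select anything (Selenium will typically raise an invalid-selector error). One option, sketched here, is to grab the visible page text and run a regex over it that accepts both "Skype:" and "Skype ID:" (the allowed username characters below are an assumption about what Skype IDs look like, not something the pages guarantee):

import re

# matches "Skype ID: username" as well as "Skype: username";
# the username character class is an assumption
SKYPE_RE = re.compile(r'Skype(?:\s*ID)?\s*:\s*([A-Za-z][A-Za-z0-9_.\-]{1,31})', re.IGNORECASE)

def find_skype_ids(page_text):
    """Return every Skype username mentioned in a blob of page text."""
    return SKYPE_RE.findall(page_text)

# hypothetical usage with the Selenium driver from the script above:
#   page_text = br.find_element_by_tag_name('body').text
#   print(find_skype_ids(page_text))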