first, i'm sorry for this long message but i have an issue that is blocking me from advancing in my project: Let me first explain quickly the workflow, User enters a search query -> making a search in linkedin with this query -> grabbing urls of users (in function of nb of pages) -> search for these users in proxycurl (https://nubela.co/proxycurl/docs#people-api-person-lookup-endpoint) -> grab their infos with a function -> store them in my db -> grabs infos about the experiences of the scraped users -> make a search in proxycurl API again but for the companies this time -> grab infos about companies and store them in db -> search infos about employees in this company (https://nubela.co/proxycurl/docs#company-api-employee-search-api-endpoint) -> grab url of the CTO -> search in the contact API to grab the infos about the CTO (https://nubela.co/proxycurl/docs#contact-api-personal-contact-number-lookup-endpoint and https://nubela.co/proxycurl/docs#contact-api-personal-email-lookup-endpoint) -> store everything in database.
Ok so i manage to grab urls, search for the users in api, but i never manage to get the 'extra' information with my code while i can grab them for the same profiles in Postman, same for personnal_email, personnal_contact_number, github_profile_id.
Then i manage to grab the data about the companies, but still same problem, can't retrieve the 'extra' information, or the 'funding_data' or 'acquisitions' even if i include them in my code.
I really don't know what's wrong with my code (i'm assuming something's wrong because everything works perfectly with postman), and i can take a little help here, thanks for your time ! (Full code below)
from telnetlib import EC
import requests
from datetime import datetime
import json
import re
from cgitb import text
import selenium
from selenium.webdriver.support.ui import WebDriverWait
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.action_chains import ActionChains
from bs4 import BeautifulSoup, NavigableString, Tag
from time import sleep
from time import time
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.options import Options
import csv
import firebase_admin
from firebase_admin import credentials
from firebase_admin import db
import openpyxl
import requests
cred = credentials.Certificate(r"C:\Users\radia\Downloads\st-londres-2-firebase-adminsdk-7eowq-786e799875.json")
firebase_admin.initialize_app(cred, {
'databaseURL': 'https://st-londres-2-default-rtdb.firebaseio.com/'
})
print('- Importation des packages')
# Task 1: webdriver configuration
driver = webdriver.Chrome(ChromeDriverManager().install())
# Task 1.1: Open Chrome and Access Linkedin
sleep(2)
url = 'https://www.linkedin.com/login'
driver.get(url)
print('Initialisation du chrome driver')
sleep(2)
# Task 1.2: Import username and password
credential = open(r"C:\Users\radia\OneDrive\Bureau\credentials.txt")
line = credential.readlines()
username = line[0]
password = line[1]
print('Importation des id')
sleep(2)
# Task 1.2: Key in login credentials
email_field = driver.find_element(By.ID, 'username')
email_field.send_keys(username)
print('Email ok')
sleep(3)
password_field = driver.find_element(By.NAME, 'session_password')
password_field.send_keys(password)
print('Mdp ok')
sleep(2)
# Task 1.2: Click the Login button
signin_field = driver.find_element(By.XPATH, '//*[#id="organic-div"]/form/div[3]/button')
signin_field.click()
sleep(3)
print('- Task A: Connexion à Linkedin')
search_field = driver.find_element(By.XPATH, '//*[#id="global-nav-typeahead"]/input')
search_query = input('Type of profile to scrape ')
search_field.send_keys(search_query)
search_field.send_keys(Keys.RETURN)
print('TASK B OK')
sleep(10)
try:
driver.find_element(By.XPATH, "//*[#id='search-reusables__filters-bar']/ul/li[2]/button").click()
except selenium.common.exceptions.NoSuchElementException:
print("Element not found")
def GetURL(): #function to grab linkedin urls
page_source = BeautifulSoup(driver.page_source, features='lxml')
a_elements = page_source.find_all('a', {'class': "app-aware-link"})
all_urls = []
for element in a_elements:
url = element.get('href')
all_urls.append(url)
return all_urls
##Pagination
sleep(2)
input_page = int(input('Nombre de pages à scraper: '))
URLs_all_page = []
for page in range(input_page):
URLs_one_page = GetURL()
sleep(2)
driver.execute_script('window.scrollTo(0, document.body.scrollHeight);') #scrolling to the end of the page
sleep(3)
next_button = driver.find_element(By.XPATH, '//button[contains(#class, "artdeco-pagination__button--next") and .//li-icon]')
driver.execute_script("arguments[0].click();", next_button)
sleep(2)
if URLs_one_page is not None:
URLs_all_page = URLs_all_page + URLs_one_page
print(URLs_all_page)
else:
print('variable stores a None value')
sleep(2)
print(URLs_all_page)
sleep(1)
def get_profile_info(url): # function to make api calls for users
api_endpoint = 'https://nubela.co/proxycurl/api/v2/linkedin'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
headers = {'Authorization': 'Bearer ' + api_key}
params = {
'url': url,
'fallback_to_cache': 'on-error',
'use_cache': 'if-present',
'skills': 'include',
'inferred_salary': 'include',
'personal_email': 'include',
'personal_contact_number': 'include',
'twitter_profile_id': 'include',
'facebook_profile_id': 'include',
'github_profile_id': 'include',
'extra': 'include',
}
try:
response = requests.get(api_endpoint, headers=headers, params=params)
if response.status_code != 404:
data_profile = response.json()
return data_profile
else:
return None
except requests.exceptions.RequestException as e:
print (e)
return None
def get_company_info(url): #function to make api calls for companies
api_key = 'SDrD73S2fXlvCMdFDExEaw'
headers = {'Authorization': 'Bearer ' + api_key}
api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company'
params = {
'resolve_numeric_id': 'true',
'categories': 'include',
'funding_data': 'include',
'extra': 'include',
'exit_data': 'include',
'acquisitions': 'include',
'url': 'include',
'use_cache': 'if-present',
}
try:
response = requests.get(api_endpoint, params={'url':url}, headers=headers)
if response.status_code == 404:
print("Company not found for URL:", url)
return None
else:
data_company = response.json()
print(data_company)
if 'extra' in data_company:
print("Extra information found:", data_company['extra'])
else:
print("No extra information found in JSON response.")
return data_company
except requests.exceptions.RequestException as e:
print (e)
return None
def get_company_employee_url(company_linkedin_profile_url):
api_endpoint = 'https://nubela.co/proxycurl/api/linkedin/company/employee/search/'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
header_dic = {'Authorization': 'Bearer ' + api_key}
params = {
'page_size': '10',
'linkedin_company_profile_url': company_linkedin_profile_url,
'keyword_regex': '[Cc][Tt][Oo]',
'enrich_profiles': 'enrich',
'resolve_numeric_id': 'false',
}
response = requests.get(api_endpoint,
params=params,
headers=header_dic)
print(response.status_code)
print(response.text)
if response.status_code == 404:
print("No employees found for URL:", url)
return None
else:
data_employees = response.json()
if 'employees' in data_employees:
print("Employees found:", data_employees['employee_search_results'])
else:
print("No employees found in JSON response.")
#return and store profile_url in data_employees:
for employee in data_employees['employee_search_results']:
profile_url = employee['profile_url']
print(profile_url)
def get_company_employee_info(profile_url):
api_endpoint = 'https://nubela.co/proxycurl/api/contact-api/personal-contact'
api_key = 'SDrD73S2fXlvCMdFDExEaw'
header_dic = {'Authorization': 'Bearer ' + api_key}
params = {
'linkedin_profile_url': 'https://linkedin.com/in/test-phone-number',
}
response = requests.get(api_endpoint,
params=params,
headers=header_dic)
# Initialize visited URLs + data_list
visited_urls = []
for url in URLs_all_page:
if url in visited_urls:
print("Profile already exists in the database for URL:", url)
continue
data = get_profile_info(url)
if data and "error" in data:
print(data["error"])
if not data or "experiences" not in data:
continue
data["search_query"] = search_query # Add the search_query to the data
db.reference('profiles').push(data) # Store data in the candidates table
visited_urls.append(url)
print("Profile data and search query successfully added to the candidates table for URL:", url)
for item in data['experiences']:
company_name = str(item['company'])
company_name_push = re.sub(r'[^a-zA-Z0-9]', '', company_name) # Error handling when pushing code to db, replacement of illegal values
company_linkedin_profile_url = item['company_linkedin_profile_url']
company_description = item['description']
company_data = get_company_info(company_linkedin_profile_url)
if company_name_push:
filtered_company = db.reference('companies/'+ company_name_push).get()
else:
continue
if filtered_company is None:
db.reference('companies').push({
'company_name': company_name_push,
'company_linkedin_profile_url': company_linkedin_profile_url,
'company_description': company_description,
'company_data': company_data
})
print("Company data successfully added for URL:", company_linkedin_profile_url)
else:
print("Company already exists in the database for URL:", company_linkedin_profile_url)
experiences = {
'candidate_name': data['full_name'],
'title': item['title'],
'company': item['company'],
'location': item['location'],
'start_date': item['starts_at'],
'end_date': item['ends_at'],
'description': item['description'],
}
db.reference('experiences').push(experiences)
company_employee_url = get_company_employee_url(company_linkedin_profile_url)
company_employee_data = get_company_employee_info(company_employee_url)
if company_employee_data:
db.reference('company_employees/' + company_name_push).push(company_employee_data)
print("Company employee data successfully added for company:", company_name)
else:
print("No data found for company employees for company:", company_name)
Related
I did a project, where I looped over every single student ID I have in my college to get the results of each individual student to create analytical dashboard for each student and send their results to them via Email with a nice report done later. I scraped the website our college uploads our results in.
the code for it was this:
#Importing The Neccessary modules
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.options import Options
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
#Reading the data
our_ids = pd.read_excel("All Our IDs.xlsx")
total_students = our_ids.shape[0]
df_to_hold_all_data = pd.DataFrame()
#Defining Functions to use in the script
def make_request(student_id):
"""
Makes a response for the student ID given, Keeps repeating it till it's a successful response.
"""
url = 'http://app1.helwan.edu.eg/Commerce/HasasnUpMlist.asp' #Base URL to our college website
params = {
'z_dep': '=',
'z_st_name': 'LIKE',
'z_st_settingno': '=',
'x_st_settingno': f'{student_id}',
'x_st_name': '',
'z_gro': '=',
'x_gro': '',
'x_dep': '',
'z_sec': 'LIKE',
'x_sec': '',
'Submit': '++++حفظ++++'
}
response_state = 0
while response_state != 200 :
try:
response = requests.get(url,params= params, timeout= 10 )
except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectTimeout):
print("Requesting Again...")
continue
response_state = response.status_code
return response
def make_the_second_request_with_selenium(link):
# Create a headless Edge driver
options = Options()
options.add_argument('--headless')
driver = webdriver.Edge(options=options)
# Set timeout for the request and try to navigate to a website
timeout = 10 # seconds
try:
driver.get(link)
WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b')))
return driver # Will Eventually return this.
except (TimeoutException, NoSuchElementException): # If the request takes more than 10 seconds or the request failed for any reason, repeat the request again
print("Requesting Again...")
make_the_second_request_with_selenium(link)
this_loop = 0
#Looping for all students
for student_id in our_ids['IDS'].unique():
print(f"\nNow Looping for {student_id}\n")
response = make_request(student_id) # Making our response
print(f"{response.status_code}")
# Parse the response and create a BeautifulSoup object
soup = BeautifulSoup(response.text, 'html.parser')
links = soup.find_all('a',{'href': True})
link_to_natega = ''
for link in links:
if "StdCode" in link['href']:
# get the link we want to go to eventually, Each Student has a unique link.
link_to_natega = f"http://app1.helwan.edu.eg/Commerce/{link['href']}"
print(link_to_natega)
try:
driver = make_the_second_request_with_selenium(link_to_natega)
name = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[2]/div/font/b').text
id_of_student = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[3]/td[4]/div/font/b').text
department = driver.find_element(By.XPATH,'/html/body/form/div/table[1]/tbody/tr[5]/td[2]/div/font/b').text
first_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[3]/td[2]/div/font/b').text
first_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[3]/td[4]/div/font/b').text
second_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[4]/td[2]/div/font/b').text
second_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[4]/td[4]/div/font/b').text
third_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[5]/td[2]/div/font/b').text
third_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[5]/td[4]/div/font/b').text
fourth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[6]/td[2]/div/font/b').text
fourth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[6]/td[4]/div/font/b').text
fifth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[7]/td[2]/div/font/b').text
fifth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[7]/td[4]/div/font/b').text
sixth_sub = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[8]/td[2]/div/font/b').text
sixth_sub_score = driver.find_element(By.XPATH,'/html/body/form/div/table[2]/tbody/tr[8]/td[4]/div/font/b').text
data = {'name': name , 'ID' : id_of_student , "Department" : department , \
"Subject" : [first_sub,second_sub,third_sub,fourth_sub,fifth_sub,sixth_sub],\
"Score": [first_sub_score,second_sub_score,third_sub_score,fourth_sub_score,fifth_sub_score,sixth_sub_score]
}
df = pd.DataFrame(data) #Create a DataFrame
df_to_hold_all_data = df_to_hold_all_data.append(df) # Append it to the dataframe we created above.
# Close the driver
driver.quit()
print(f"The shape of the data now is: {df_to_hold_all_data.shape}")
except:
print(f'failed to get data for {student_id}')
this_loop += 1
remaining_students = total_students - this_loop
print(f'Done Looping For {student_id} The remaining students: {remaining_students}')
df_to_hold_all_data.to_excel("All Our Results.xlsx",index=False)
I don't know if it's possible to create this with scrapy?
If yes, How much would it make the process faster?
Is it worth investing the time and effort to learn it and rewrite the code again?
edit: Sorry for poor structure, Data Analysis and statistics is the part where I am actually good :D
Your help would be appreciated.
I'm trying to get the destination of a bunch of t.co links from Twitter. I can get this for active links, but when they are 404 or dead links, the program dies. If I enter this into the browser, it shows me the destination URL.
Is there a way to do this in Python 3?
This is my existing code:
import requests
import pandas as pd
from requests.models import Response
# Loading my array of links
data = pd.read_json('tco-links.json')
links = pd.DataFrame(data)
output = []
session = requests.Session() # so connections are recycled
with open('output.json', 'w') as f:
for index, row in links.iterrows():
fullLink = 'http://' + row['link']
try:
response = session.head(fullLink, allow_redirects=True)
except:
# how I'm handling errors right now
response = Response()
response.url = 'Failed'
output.append({
'link': fullLink,
'id': row['id'],
'unshortened': response.url
})
for x in output:
f.write(json.dumps(x) + '\n')
I was trying to make a crawler to follow links, with this code
import scrapy
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
class DicionarioSpider(scrapy.Spider):
name = 'dicionario'
allowed_domains = ['www.mediktor.com']
start_urls = ['http://www.mediktor.com/']
def start_requests(self):
url = "https://www.mediktor.com/pt-br/glossario"
options = Options()
options.headless = True
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(10)
doencas = driver.find_elements(
By.XPATH, "//a[#class='mdk-dictionary-list__glossary-item']")
for doenca in doencas:
url = doenca.get_attribute('href')
yield scrapy.Request(url)
driver.quit()
def parse(self, response):
urls = response.css(
'.mdk-dictionary-list__glossary-item a::attr(href)')
for url in urls:
yield response.follow(url.get(), callback=self.parse_info)
def parse_info(self, response):
contents = response.css('div.page-glossary-detail__main-content')
for desc in response.css('div.mdk-conclusion-detail__main-description'):
desc = response.css('p ::text').getall()
yield {
'desc': desc
}
for content in contents:
yield{
'name': content.css(
'div.mdk-conclusion-detail__main-title ::text').get().strip(),
'espec': content.css(
'div.mdk-ui-list-item__text mdc-list-item__text span::text').strip()
}
I was able to get the links but the part of entering the links and getting the information I need was not working, so a friend helped me to come up with this code
import pandas as pd
import requests
from bs4 import BeautifulSoup
def get_auth_code():
url = "https://www.mediktor.com/vendor.js"
response = requests.get(url)
start_index = response.text.index('APP_API_AUTH_CODE:"', 0) + len('APP_API_AUTH_CODE:"')
end_index = response.text.index('"', start_index)
return response.text[start_index:end_index]
def get_auth_token_and_device_id():
url = "https://euapi01.mediktor.com/backoffice/services/login"
payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
"\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
"\"timezoneRaw\":180,\"authTokenRefreshExpiresIn\":null}"
headers = {
'authorization': f'Basic {get_auth_code()}',
'Content-Type': 'text/plain'
}
response = requests.request("POST", url, headers=headers, data=payload)
return response.json()['authToken'], response.json()['deviceId']
def get_conclusion_list(auth_token, device_id):
url = "https://euapi01.mediktor.com/backoffice/services/conclusionList"
payload = "{\"useCache\":168,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"" \
",\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"pt_BR\"," \
"\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"}"
headers = {
'accept': 'application/json, text/plain, */*',
'authorization': f'Bearer {auth_token}',
'content-type': 'application/json;charset=UTF-8'
}
response = requests.request("POST", url, headers=headers, data=payload)
return [conclusionId['conclusionId'] for conclusionId in response.json()['conclusions']]
def get_details(conclusionId, auth_token, device_id):
url = "https://euapi01.mediktor.com/backoffice/services/conclusionDetail"
payload = "{\"useCache\":0,\"apiVersion\":\"4.1.1\",\"appVersion\":\"8.7.0\"," \
"\"appId\":null,\"deviceType\":\"WEB\",\"deviceToken\":null,\"language\":\"en_EN\"," \
"\"timezoneRaw\":180,\"deviceId\":\"" + device_id + "\"," \
"\"conclusionId\":\"" + conclusionId + "\"," \
"\"conclusionTemplate\":\"conclusion_description_body\",\"includeActions\":true}"
headers = {
'authorization': f'Bearer {auth_token}',
'content-type': 'application/json;charset=UTF-8'
}
response = requests.request("POST", url, headers=headers, data=payload)
return response.text
auth_token, device_id = get_auth_token_and_device_id()
conclusion_list = get_conclusion_list(auth_token, device_id)
for conclusion in conclusion_list:
print(get_details(conclusion, auth_token, device_id))
It gets the json with the page items, but in loop number 230 it starts returning the following error and won't leave the loop
{"error":{"code":"ME667","description":"Expired user identification token.","retry":true}}
What I'd like to do is, pass this all to a file so I can see if it's getting all the items on the page I need and then leave a json with just the information I need, not everything from the site as it's returning now
I after many sleepless nights solved my problem, I will leave it here in case it helps someone.
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import json
class DicionarioSpider(scrapy.Spider):
name = 'dicionario'
allowed_domains = ['www.mediktor.com']
start_urls = ['http://www.mediktor.com/']
def parse(self, response):
url = "https://www.mediktor.com/pt-br/glossario"
option = Options()
option.headless = True
driver = webdriver.Chrome(options=option)
driver.get(url)
time.sleep(10)
el_links = driver.find_elements(
By.XPATH, "//a[#class='mdk-dictionary-list__glossary-item']")
urls = []
nome_doenca = []
for i in range(len(el_links)):
urls.append(el_links[i].get_attribute('href'))
for link in urls:
driver.get(link)
myElem = WebDriverWait(driver, 5).until(
EC.presence_of_element_located((By.XPATH,
"//div[#class='mdk-conclusion-detail__main-title']"
)))
nome_source = driver.find_element(By.XPATH,
"//div[#class='mdk-conclusion-detail__main-title']"
).text
nome_doenca.append(nome_source)
driver.back()
print(nome_doenca)
driver.quit()
I just modified my code and didn't use scrapy, just the selenium selectors.
How would I be able to continuously and automatically scrape from a website. For example get the trending quotes every 6 hours from the tsx website and update my JSON file according? In other words, continuously execute my python script every 6 hours automatically
Reason behind this, I will be using my JSON file in my own personal website to output the data, styled in html and css. So whenever someone comes to my site, the content would have updated from the previous visit, if the tsx trending quotes updated.
# grabs all the trending quotes for that day
def getTrendingQuotes(browser):
# wait until trending links appear, not really needed only for example
all_trendingQuotes = WebDriverWait(browser, 10).until(
lambda d: d.find_elements_by_css_selector('#trendingQuotes a')
)
return [link.get_attribute('href') for link in all_trendingQuotes]
def getStockDetails(url, browser):
print(url)
browser.get(url)
quote_wrapper = browser.find_element_by_css_selector('div.quote-wrapper')
quote_name = quote_wrapper.find_element_by_class_name(
"quote-name").find_element_by_tag_name('h2').text
quote_price = quote_wrapper.find_element_by_class_name("quote-price").text
quote_volume = quote_wrapper.find_element_by_class_name(
"quote-volume").text
print("\n")
print("Quote Name: " + quote_name)
print("Quote Price: " + quote_price)
print("Quote Volume: " + quote_volume)
print("\n")
convertToJson(quote_name, quote_price, quote_volume, url)
quotesArr = []
# Convert to a JSON file
def convertToJson(quote_name, quote_price, quote_volume, url):
quoteObject = {
"url": url,
"Name": quote_name,
"Price": quote_price,
"Volume": quote_volume
}
quotesArr.append(quoteObject)
def trendingBot(url, browser):
browser.get(url)
trending = getTrendingQuotes(browser)
for trend in trending:
getStockDetails(trend, browser)
# requests finished, write json to file
with open('trendingQuoteData.json', 'w') as outfile:
json.dump(quotesArr, outfile)
def Main():
chrome_options = Options()
chrome_options.add_argument("--headless")
# applicable to windows os only
chrome_options.add_argument('--disable-gpu')
url = 'https://www.tmxmoney.com/en/index.html'
browser = webdriver.Chrome(
r"C:\Users\austi\OneDrive\Desktop\chromeDriver\chromedriver_win32\chromedriver.exe", chrome_options=chrome_options)
browser.get(url)
os.system('cls')
print("[+] Success! Bot Starting!")
trendingBot(url, browser)
# trendingBot(browser)
browser.quit()
if __name__ == "__main__":
Main()
I am trying to scrape some emails from mdpi.com, emails available only to logged in users. But it fails when I am trying to do so. I am getting
when logged out:
Code itself:
import requests
from bs4 import BeautifulSoup
import traceback
login_data = {'form[email]': 'xxxxxxx#gmail.com', 'form[password]': 'xxxxxxxxx', 'remember': 1,}
base_url = 'http://www.mdpi.com'
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; rv:40.0) Gecko/20100101 Firefox/40.0'}
session = requests.Session()
session.headers = headers
# log_in
s = session.post('https://susy.mdpi.com/user/login', data=login_data)
print(s.text)
print(session.cookies)
def make_soup(url):
try:
r = session.get(url)
soup = BeautifulSoup(r.content, 'lxml')
return soup
except:
traceback.print_exc()
return None
example_link = 'http://www.mdpi.com/search?journal=medsci&year_from=1996&year_to=2017&page_count=200&sort=relevance&view=default'
def article_finder(soup):
one_page_articles_divs = soup.find_all('div', class_='article-content')
for article_div in one_page_articles_divs:
a_link = article_div.find('a', class_='title-link')
link = base_url + a_link.get('href')
print(link)
article_soup = make_soup(link)
grab_author_info(article_soup)
def grab_author_info(article_soup):
# title of the article
article_title = article_soup.find('h1', class_="title").text
print(article_title)
# affiliation
affiliations_div = article_soup.find('div', class_='art-affiliations')
affiliation_dict = {}
aff_indexes = affiliations_div.find_all('div', class_='affiliation-item')
aff_values = affiliations_div.find_all('div', class_='affiliation-name')
for i, index in enumerate(aff_indexes): # 0, 1
affiliation_dict[int(index.text)] = aff_values[i].text
# authors names
authors_div = article_soup.find('div', class_='art-authors')
authors_spans = authors_div.find_all('span', class_='inlineblock')
for span in authors_spans:
name_and_email = span.find_all('a') # name and email
name = name_and_email[0].text
# email
email = name_and_email[1].get('href')[7:]
# affiliation_index
affiliation_index = span.find('sup').text
indexes = set()
if len(affiliation_index) > 2:
for i in affiliation_index.strip():
try:
ind = int(i)
indexes.add(ind)
except ValueError:
pass
print(name)
for index in indexes:
print('affiliation =>', affiliation_dict[index])
print('email: {}'.format(email))
if __name__ == '__main__':
article_finder(make_soup(example_link))
What should I do in order to get what I want?
Ah that is easy, you haven't managed to log in correctly. If you look at the response from your initial call you will see that you are returned the login page HTML instead of the my profile page. The reason for this is that you are not submitted the hidden token on the form.
The solution request the login page, and then use either lxml or BeautifulSoup to parse the hidden input 'form[_token]'. Get that value and then add it to your login_data payload.
Then submit your login request and you'll be in.