Scrape/Extract Skype IDs from Google - python

So basically, websites list their Skype ID in this format: Skype ID: USERNAMEWOULDBEHERE or Skype: USERNAMEWOULDBEHERE
I'm just trying to extract their usernames/Skype ID.
Am I doing anything wrong? How would I check for both strings? (Skype: & Skype ID:)
Help is much appreciated. I'm a beginner in Python so please go easy with me lol.
#!/usr/bin/env python2
# -*- coding: utf8 -*-
import sys
import time
import random
import argparse
from selenium import webdriver
from selenium.webdriver.support.ui import Select, WebDriverWait
from selenium.common.exceptions import NoSuchFrameException
from selenium.webdriver.common.keys import Keys

# If this script no longer fetches any results check the XPath

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s', '--search', help='Enter the search term')
    parser.add_argument('-p', '--pages', default='1', help='Enter how many pages to scrape (1 page = 100 results)')
    return parser.parse_args()

def start_browser():
    br = webdriver.Firefox()
    br.implicitly_wait(10)
    return br

def get_ua():
    ua_list = ['Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.131 Safari/537.36',
               'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:29.0) Gecko/20100101 Firefox/29.0',
               'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/537.36',
               'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:28.0) Gecko/20100101 Firefox/28.0']
    ua = random.choice(ua_list)
    return ua

def scrape_results(br):
    links = br.find_elements_by_xpath("Skype ID: ")
    results = []
    for link in links:
        title = link.text.encode('utf8')
        url = link.get_attribute('href')
        title_url = (title, url)
        results.append(title_url)
    return results

def go_to_page(br, page_num, search_term):
    page_num = page_num - 1
    start_results = page_num * 100
    start_results = str(start_results)
    url = 'https://www.google.com/webhp?#num=100&start='+start_results+'&q='+search_term
    print '[*] Fetching 100 results from page '+str(page_num+1)+' at '+url
    br.get(url)
    time.sleep(2)

def main():
    args = parse_args()
    br = start_browser()
    if not args.search:
        sys.exit("[!] Enter a term or phrase to search with the -s option: -s 'dan mcinerney'")
    search_term = args.search
    pages = args.pages
    all_results = []
    for page_num in xrange(int(pages)):
        page_num = page_num + 1  # since it starts at 0
        go_to_page(br, page_num, search_term)
        titles_urls = scrape_results(br)
        for title in titles_urls:
            all_results.append(title)
    for result in all_results:
        title = result[0]
        url = result[1]
        print '[+]', title, '--', url
    br.quit()

if __name__ == "__main__":
    main()
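One note on the two-prefix check: find_elements_by_xpath expects an XPath expression, so passing the literal string "Skype ID: " won't select anything. A regex over each page's HTML can cover both "Skype:" and "Skype ID:" at once. A minimal sketch (the allowed username characters are an assumption; adjust them if IDs get cut short):

import re
import requests

# Matches "Skype: username" or "Skype ID: username" (case-insensitive).
# The username character class is an assumption -- tweak it as needed.
skype_re = re.compile(r'Skype(?:\s*ID)?\s*:\s*([A-Za-z][A-Za-z0-9_.\-]{1,31})', re.I)

def extract_skype_ids(url):
    html = requests.get(url, timeout=10).text
    return skype_re.findall(html)

# e.g. run it over the result URLs collected by scrape_results():
# for title, url in all_results:
#     print url, extract_skype_ids(url)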

Related

Python web scraping LinkedIn - incomplete user results

Please, I need some support from you all. This is practice Python code for scraping employee information/user URLs from LinkedIn. It can currently only print the user names and their current positions, and even that result is incomplete (some entries are just a name without the role at the company). In the end, the user URLs could not be printed out at all.
import random
import argparse
import requests
import re

parser = argparse.ArgumentParser(description='Searches Google For Linkedin Profiles')
parser.add_argument('--keyword', type=str, help='keywords to search')
parser.add_argument('--limit', type=int, help='how many profiles to scrape')
args = parser.parse_args()

class LinkedinScraper(object):
    def __init__(self, keyword, limit):
        #:param keyword: a str of keyword(s) to search for
        #:param limit: number of profiles to scrape
        self.keyword = keyword.replace(' ', '%20')
        self.all_htmls = ""
        self.server = 'www.google.com'
        self.quantity = '100'
        self.limit = int(limit)
        self.counter = 0

    def search(self):
        #perform the search
        #:return: a list of htmls from Google Searches
        # choose a random user agent
        user_agents = [
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36',
            'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) chromeframe/10.0.648.205',
            'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36',
            'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6',
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.19 (KHTML, like Gecko) Ubuntu/11.10 Chromium/18.0.1025.142 Chrome/18.0.1025.142 Safari/535.19',
            'Mozilla/5.0 (Windows NT 5.1; U; de; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6 Opera 11.00'
        ]
        while self.counter < self.limit:
            headers = {'User-Agent': random.choice(user_agents)}
            url = 'http://google.com/search?num=100&start=' + str(self.counter) + '&hl=en&meta=&q=site%3Alinkedin.com/in%20' + self.keyword
            resp = requests.get(url, headers=headers)
            if ("Our systems have detected unusual traffic from your computer network.") in resp.text:
                print("Running into captchas")
                return
            self.all_htmls += resp.text
            self.counter += 10

    def parse_links(self):
        reg_links = re.compile("url=https://www.linkedin.com(.*?)&")
        self.temp = reg_links.findall(self.all_htmls)
        results = []
        for regex in self.temp:
            final_url = regex.replace("url= ", "")
            results.append("https://www.linkedin.com" + final_url)
        return results

    def parse_people(self):
        # :param html: parse the html for Linkedin Profiles using regex
        # :return: a list of
        reg_people = re.compile(r'>[a-zA-Z0-9._ -]* -|\| LinkedIn')
        self.temp = reg_people.findall(self.all_htmls)
        print(self.temp)
        results = []
        for iteration in (self.temp):
            delete = iteration.replace(' | LinkedIn', '')
            delete = delete.replace(' - LinkedIn', '')
            delete = delete.replace(' profiles ', '')
            delete = delete.replace('LinkedIn', '')
            delete = delete.replace('|', '')
            delete = delete.replace('"', '')
            delete = delete.replace('>', '')
            delete = delete.strip("-")
            if delete != " ":
                results.append(delete)
        return results

if __name__ == "__main__":
    ls = LinkedinScraper(keyword="Tesla", limit=100)
    ls.search()
    links = ls.parse_links()
    print(links)
    profiles = ls.parse_people()
    print(*profiles, sep="\n")
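Worth noting: Google's result markup changes often, so brittle regexes like the ones in parse_links/parse_people tend to miss entries or drop the role text. A more tolerant option is to parse the stored HTML with BeautifulSoup and pair every linkedin.com/in link with its heading text. A rough sketch, assuming the result title sits in an h3 inside the link, which may not match Google's current markup:

from bs4 import BeautifulSoup

def parse_profiles(html):
    # Pair each linkedin.com/in link with its visible title text, if any.
    soup = BeautifulSoup(html, 'html.parser')
    profiles = []
    for a in soup.find_all('a', href=True):
        if 'linkedin.com/in' not in a['href']:
            continue
        h3 = a.find('h3')  # assumption about where the result title lives
        title = h3.get_text(strip=True) if h3 else a.get_text(strip=True)
        profiles.append((title, a['href']))
    return profiles

# usage with the scraper above:
# ls.search()
# for title, link in parse_profiles(ls.all_htmls):
#     print(title, link)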

Beautiful Soup web scraping returning None - Python

I have a list of movies for which I want to scrape the genres from Google.
I've built this code:
import requests
from bs4 import BeautifulSoup

list = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
gen2 = {}
for i in list:
    user_query = i + 'movie genre'
    URL = 'https://www.google.co.in/search?q=' + user_query
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
    page = requests.get(URL, headers=headers)
    soup = BeautifulSoup(page.content, 'html.parser')
    c = soup.find(class_='EDblX DAVP1')
    print(c)
    if c != None:
        genres = c.findAll('a')
        gen2[i] = genres
But it returns an empty dict, so I checked the movies one by one and it worked. For example:
import requests
from bs4 import BeautifulSoup

user_query = 'Se7en movie genre'
URL = "https://www.google.co.in/search?q=" + user_query
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
page = requests.get(URL, headers=headers)
soup = BeautifulSoup(page.content, 'html.parser')
v = soup.find(class_='KKHQ8c')
h = {}
genres = v.findAll('a')
for genre in genres:
    h['Se7en'] = genre
So I found out that inside the for loop the variable c is None.
I can't figure out why! It only returns None inside the loop.
Currently, your URLs are of the form https://www.google.co.in/search?q=Se7enmovie genre (the movie title runs straight into the word "movie" and the spaces are not URL-encoded), so the returned Google results aren't accurate for all the movies.
You can change it to
for i in list:
    i = "+".join(i.split(" "))
    user_query = i + "+movie+genre"
    URL = 'https://www.google.com/search?q=+' + user_query
Also, movies that belong to a single genre, like Cinema Paradiso, are in a div with class name "Z0LcW".
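Putting the two cases together, a sketch that tries the multi-genre container first and falls back to the single-genre div. Both class names come from the question and the note above, and they are likely to change whenever Google updates its markup:

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.63 Safari/537.36'}
movies = ['Se7en', 'Cinema Paradiso', 'The Shining', 'Toy Story 3', 'Capernaum']
gen2 = {}

for title in movies:
    user_query = "+".join(title.split(" ")) + "+movie+genre"
    URL = 'https://www.google.com/search?q=' + user_query
    soup = BeautifulSoup(requests.get(URL, headers=headers).content, 'html.parser')
    multi = soup.find(class_='EDblX DAVP1')   # container listing several genres
    single = soup.find(class_='Z0LcW')        # div holding a single genre
    if multi is not None:
        gen2[title] = [a.get_text() for a in multi.find_all('a')]
    elif single is not None:
        gen2[title] = [single.get_text()]

print(gen2)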

How can I get the data from a span in BeautifulSoup?

This is my code. I want to take each location's name and link. The variable "lugares" finds multiple item-containers, but I only want the first one [0]; then comes the for loop, but I can't find the span classes.
from bs4 import BeautifulSoup
import requests

b = []
i = "https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36", 'Accept-Language': 'en-US, en;q=0.5'}
page = requests.get(url, headers=encabezado)
soup = BeautifulSoup(page.content, "html.parser")
lugares = soup.find_all("div", {"class": "items-container"})
lugares = lugares[0]
print(len(lugares))
for lugar in lugares:
    locationlink = i + str(lugar.find("span", {"class": "item"}).find("a")["href"])
    location = lugar.find("span", {"class": "item"}).text
    a = [location, locationlink]
    b.append(a)
There are multiple options to reach the goal; the best one depends on what you expect and what you want to do with this information in the follow-up process.
First Option
If you are just looking for the info of the first location, you can do the following:
lugar = soup.select_one('div.items-container a')
b = [lugar.text, f'{i}{lugar["href"]}']
or
lugar = soup.select('div.items-container a')[0]
b = [lugar.text, f'{i}{lugar["href"]}']
Both select the first <a> in the <div> with class items-container.
Output
['Huixquilucan','https://www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1']
Alternative
If you want to get everything at once, you should use a list of dicts, so later on you just have to iterate over it and all the information is in place:
[{'name':x.text, 'link':f'{i}{x["href"]}'} for x in soup.select('div.items-container a')]
Output
[{'name': 'Huixquilucan',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1'},
{'name': 'Naucalpan',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/naucalpan/v1c1098l10710p1'},
{'name': 'Atizapán',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/atizapan/v1c1098l10662p1'},
{'name': 'Metepec',
'link': 'https://www.vivanuncios.com.mx/s-renta-inmuebles/metepec-edomex/v1c1098l10707p1'},...]
Example (showing results of both)
from bs4 import BeautifulSoup
import requests
i="https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36",'Accept-Language': 'en-US, en;q=0.5'}
page =requests.get(url,headers=encabezado)
soup = BeautifulSoup(page.content,"html.parser")
lugar = soup.select_one('div.items-container a')
b = [lugar.text, f'{i}{lugar["href"]}']
print(f'First lugar:\n {b} \n')
## or alternative option
allLugaros = [{'name':x.text, 'link':f'{i}{x["href"]}'} for x in soup.select('div.items-container a')]
print(f'First lugar from lugaros (list of dict):\n {allLugaros[0]} \n')
print(f'All lugaros as list of dict:\n {allLugaros} \n')
First, you need to get all the spans in the first element, lugares[0].
Then you need to iterate over each span to get the link and text for each location.
The Code:
from bs4 import BeautifulSoup
import requests

b = []
i = "https://www.vivanuncios.com.mx"
url = "https://www.vivanuncios.com.mx/s-renta-inmuebles/estado-de-mexico/v1c1098l1014p1"
encabezado = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36", 'Accept-Language': 'en-US, en;q=0.5'}
page = requests.get(url, headers=encabezado)
soup = BeautifulSoup(page.content, "html.parser")
lugares = soup.find_all("div", {"class": "items-container"})
#lugares=lugares[0]
print(len(lugares))
# get all spans
spans = lugares[0].find_all("span", {"class": "item"})
# iterate through each span
for span in spans:
    # get location text
    location = span.find("a").text
    # locationlink builder
    site = "www.vivanuncios.com.mx"
    link = span.find("a")["href"]
    locationlink = f"{site}{link}"
    a = [location, locationlink]
    b.append(a)
print(b[0])
Output:
['Huixquilucan', 'www.vivanuncios.com.mx/s-renta-inmuebles/huixquilucan/v1c1098l10689p1']

How to get all URLs within a page from oddsportal?

I have a code that scrapes all URLs from oddsportal.com main page.
I want the subsequent links to all pages within the parent URL.
e.g.
https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/
has further pages i.e. https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/, https://www.oddsportal.com/soccer/africa/africa-cup-of-nations-2019/results/, etc.
How can I get that?
My existing code:
import requests
import bs4 as bs
import pandas as pd

url = 'https://www.oddsportal.com/results/#soccer'
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
resp = requests.get(url, headers=headers)
soup = bs.BeautifulSoup(resp.text, 'html.parser')
base_url = 'https://www.oddsportal.com'
a = soup.findAll('a', attrs={'foo': 'f'})
# This set will have all the URLs of the main page
s = set()
for i in a:
    s.add(base_url + i['href'])
s = list(s)
# This will filter for all soccer URLs
s = [x for x in s if '/soccer/' in x]
s = pd.DataFrame(s)
print(s)
I am very new to webscraping and hence this question.
You can find the main div tag based on its class attribute and use the find_all method to get the a tags; by looping over them you can extract each href.
from bs4 import BeautifulSoup
import requests

headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36'}
source = requests.get("https://www.oddsportal.com/soccer/africa/africa-cup-of-nations/results/", headers=headers)
soup = BeautifulSoup(source.text, 'html.parser')
main_div = soup.find("div", class_="main-menu2 main-menu-gray")
a_tag = main_div.find_all("a")
for i in a_tag:
    print(i['href'])
Output:
/soccer/africa/africa-cup-of-nations/results/
/soccer/africa/africa-cup-of-nations-2019/results/
/soccer/africa/africa-cup-of-nations-2017/results/
/soccer/africa/africa-cup-of-nations-2015/results/
/soccer/africa/africa-cup-of-nations-2013/results/
/soccer/africa/africa-cup-of-nations-2012/results/
/soccer/africa/africa-cup-of-nations-2010/results/
/soccer/africa/africa-cup-of-nations-2008/results/
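To tie this back to the original script, you can loop over the filtered soccer URLs (the list s, before it is turned into a DataFrame), pull the same menu div from each page, and prefix base_url. A rough sketch reusing the names from the question's code:

all_links = set()
for parent_url in s:   # the filtered list of soccer URLs from the question's code
    resp = requests.get(parent_url, headers=headers)
    sub_soup = bs.BeautifulSoup(resp.text, 'html.parser')
    menu = sub_soup.find('div', class_='main-menu2 main-menu-gray')
    if menu is None:
        continue
    for tag in menu.find_all('a', href=True):
        all_links.add(base_url + tag['href'])

print(len(all_links))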

How to grab spot price from yahoo finance using BeautifulSoup

I'm trying to grab the spot price of the SPY ETF: https://finance.yahoo.com/quote/SPY/options
I've mostly tried using soup.find_all, using the nested 'div' tags:
from bs4 import BeautifulSoup
import urllib.request

url = 'https://finance.yahoo.com/quote/SPY/options/'
source = urllib.request.urlopen(url).read()
soup = BeautifulSoup(source, 'lxml')

for div in soup.find_all('div', class_="My(6px) smartphone_Mt(15px)"):
    print(div.text)
for div in soup.find_all('div', class_="D(ib) Maw(65%) Ov(h)"):
    print(div.text)
for div in soup.find_all('div', class_="D(ib) Mend(20px)"):
    print(div.text)
Nothing is printed. I also tried the following:
print(soup.find('span', attrs = {'data-reactid':"35"}).text)
which results in 'Last Price' being printed. Now obviously I want the last price, rather than the words 'last price', but this is closer.
Nested in that span tag is some html which includes the number I want. I'm guessing the correct answer has to do with the 'react text: 36' stuff within the span tag (can't type it without stackoverflow thinking I'm trying to actually implement the html into this question).
If you just want the price:
import urllib.request
from bs4 import BeautifulSoup, Comment
page = urllib.request.urlopen("https://finance.yahoo.com/quote/SPY?p=SPY")
content = page.read().decode('utf-8')
soup = BeautifulSoup(content, 'html.parser')
comments = soup.findAll(text=lambda text:isinstance(text, Comment))
[comment.extract() for comment in comments]
price = soup.find("span", {"data-reactid": "14", "class" : "Trsdu(0.3s) "}).text
print(price)
Outputs:
271.40
I recommend you use the scrapy and requests modules:
import random
import requests
from bs4 import BeautifulSoup
from scrapy.selector import Selector
ajanlar = [
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)',
'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)']
url = "https://finance.yahoo.com/quote/SPY/options"
headers = {"User-Agent":random.choice(ajanlar)}
response = requests.get(url,headers=headers,proxies=None)
soup = BeautifulSoup(response.text, 'lxml')
xpath1 = "normalize-space(//div[@class='Mt(6px) smartphone_Mt(15px)'])"
xpath2 = "normalize-space(//div[@class='D(ib) Maw(65%) Maw(70%)--tab768 Ov(h)'])"
xpath3 = "normalize-space(//div[@class='D(ib) Mend(20px)'])"
var1 = Selector(text=response.text).xpath(xpath1).extract()[0]
var2 = Selector(text=response.text).xpath(xpath2).extract()[0]
var3 = Selector(text=response.text).xpath(xpath3).extract()[0]
print(var1)
print(var2)
print(var3)
Outputs:
269.97-1.43 (-0.53%)At close: 4:00PM EST269.61 -0.44 (-0.16%)After hours: 6:08PM ESTPeople also watchDIAIWMQQQXLFGLD
269.97-1.43 (-0.53%)At close: 4:00PM EST269.61 -0.44 (-0.16%)After hours: 6:08PM EST
269.97-1.43 (-0.53%)At close: 4:00PM EST
After that, you could apply a regex.
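For example, the first decimal number in var3 is the spot price, so something like this pulls it out (assuming the string keeps the "price change (percent) At close ..." layout shown above):

import re

match = re.search(r'\d+\.\d+', var3)   # first decimal number, e.g. '269.97'
if match:
    spot_price = float(match.group())
    print(spot_price)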
