Question
How can I modify my script so that it successfully shows the number of answers I have made, broken down by topic?
Code
This is the script I tried:
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

def get_topics('Juan-Gallardo'):
    url = "http://www.quora.com/" + 'Juan-Gallardo' + "/topics"
    browser = webdriver.Chrome()
    browser.get(url)
    time.sleep(2)

    bod = browser.find_element_by_tag_name("body")

    no_of_pagedowns = 40
    while no_of_pagedowns:
        bod.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)
        no_of_pagedowns -= 1

    topics = [t.text.encode('ascii', 'replace') for t in browser.find_elements_by_class_name("name_text")]
    counts = [c.text.encode('ascii', 'replace').split(' ')[0] for c in browser.find_elements_by_class_name("name_meta")]

    li = [[topics[i], int(counts[i])] for i in xrange(len(topics)) if counts[i] != '']

    browser.quit()
    return li
Errors
You need to define a parameter for the get_topics() function instead of hard-coding the username:

def get_topics(user):
    url = "http://www.quora.com/" + user + "/topics"
    ...

Then, call the function this way:

get_topics('Juan-Gallardo')
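To actually print the number of answers per topic (the original goal), a minimal usage sketch could look like the following. It assumes the corrected get_topics(user) above and that Quora still exposes the name_text / name_meta class names, which may have changed since this was written.

# Hypothetical usage sketch, relying on the corrected get_topics(user) above.
topic_counts = get_topics('Juan-Gallardo')

for topic, count in topic_counts:
    print("{}: {} answers".format(topic, count))

print("Total answers counted: {}".format(sum(c for _, c in topic_counts)))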
Related
My scraper calls the website, hits each of the 44 pages, and creates a CSV file, but the CSV file is empty. I return from each function and save the data to the CSV at the end of the scraper.
Can anyone see what is wrong with my code?
Code:
import pandas, requests, bs4, time
from seleniumwire import webdriver
from webdriver_manager.firefox import GeckoDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import datetime

TODAY = datetime.datetime.today().strftime("%Y%m%d")
SAVE_FILENAME = "/Users/180284/jupyter-1.0.0/pssi_jobs-" + TODAY + ".csv"

driver = webdriver.Chrome('~/Desktop/chromedriver_mac64')
driver.implicitly_wait(30)

URL_BASE = "https://jobs.pssi.com/us/en/search-resultskeywords=%22food%20safety%20team%20member%22&s=1"
MAX_PAGE = 44
HEADERS = {
    'From': 'myemail'
}

def interceptor(request):
    del request.headers['From']
    request.headers['From'] = HEADERS["From"]

driver.request_interceptor = interceptor

def parse_job_post_div(div_html):
    soup = bs4.BeautifulSoup(div_html)
    job_ls = soup.findAll("div", {"class": "information"})
    job_data = []
    for job in job_ls:
        job_listing = job.find("div", {"class": "information"}).get_text(separator=", ").strip()
        title = job.find("span", {"role": "heading"}).get_text(separator=", ").strip()
        job_location = job.find("p", {"class": "job-info"}).get_text(separator=", ").strip()
        new_row = {"job_listing": job, "title": title, "job_location": job_location}
        job_data.append(new_row)
    return job_data

def get_data(wd):
    job_postings = driver.find_element(By.CLASS_NAME, "information")
    html = job_postings.get_attribute("innerHTML")
    parsed = parse_job_post_div(html)
    return pandas.DataFrame(parsed)

def process_page(url):
    driver.get(url)
    master_data = []
    i = 0
    while True:
        df = get_data(driver)
        master_data.append(df)
        if i == (MAX_PAGE - 1):
            break
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
        time.sleep(10)
        print(i)
        i += 1
    return pandas.concat(master_data, ignore_index=True)

data = process_page(URL_BASE)
data.to_csv(SAVE_FILENAME)
I have tried the above code.
The first problem I found in your code is that job_ls is an empty list, i.e. soup.findAll("div", {"class": "information"}) doesn't find anything.
Moreover, job_postings contains only one WebElement (the first job in the list) instead of all 10 jobs shown on the page, because you used .find_element instead of .find_elements. As a result of these and other problems, process_page(URL_BASE) returns an empty dataframe.
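As a quick illustration of that difference, here is a generic Selenium sketch (not tied to this site):

from selenium.webdriver.common.by import By

# find_element returns only the FIRST matching element (or raises NoSuchElementException)
first_card = driver.find_element(By.CLASS_NAME, "information")

# find_elements returns a list with ALL matching elements (possibly empty)
all_cards = driver.find_elements(By.CLASS_NAME, "information")
print(len(all_cards))  # e.g. 10 on a full results page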
In this case you can speed up the process and use less code by using Selenium directly instead of bs4:
driver.get(URL_BASE)
driver.implicitly_wait(30)
MAX_PAGE = 4

titles, locations, descriptions = [], [], []

for i in range(MAX_PAGE):
    print('current page:', i + 1, end='\r')

    titles += [title.text for title in driver.find_elements(By.CSS_SELECTOR, '.information > span[role=heading]')]
    locations += [loc.text.replace('\n', ', ') for loc in driver.find_elements(By.CSS_SELECTOR, '.information > p[class=job-info]')]
    descriptions += [desc.text for desc in driver.find_elements(By.CSS_SELECTOR, '.information > p[data-ph-at-id=jobdescription-text]')]

    if i < MAX_PAGE - 1:
        driver.find_element(By.XPATH, "//span[@class='icon icon-arrow-right']").click()
    else:
        break

df = pandas.DataFrame({'title': titles, 'location': locations, 'description': descriptions})
df.to_csv(SAVE_FILENAME, index=False)
and df will contain one row per job, with the scraped title, location and description columns.
What I am trying to do is get the asin attribute from a div element in the HTML, then concatenate it with amazon.com/dp/ to form a URL that is then visited. The divs have no id but can be identified by the data-index="1" attribute, so I am wondering how to select that div and then read its asin attribute. Thanks for reading.
Using Python 3.7 and Selenium WebDriver.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

email = ('.')
password = ('.')
query = ('macbook')

urls = []
prices = []
names = []
descs = []

def search_amazon(query):
    driver.get('https://amazon.com/')
    searchBox = driver.find_element_by_id('twotabsearchtextbox')
    time.sleep(2)
    searchBox.send_keys(query)
    searchBox.send_keys(Keys.ENTER)
    time.sleep(3)
    firstResult = driver.find_element_by_name('data-index="1"')
    asin = firstResult.getAttribute('data-asin')
    print(asin)
    url = 'https://amazon.com/dp/' + asin
    driver.get(url)
    print(url)
    return url

search_amazon(query)
You need to change these two lines of code to the code I have provided below.

firstResult = driver.find_element_by_name('data-index="1"')
asin = firstResult.getAttribute('data-asin')

data-index is not the element's name but an attribute, so you can use the following CSS selector instead. (Note also that the Python binding spells the method get_attribute, not getAttribute.)

firstResult = driver.find_element_by_css_selector('div[data-index="1"]>div')
asin = firstResult.get_attribute('data-asin')
Here is the working code.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

driver = webdriver.Chrome()

email = ('.')
password = ('.')
query = ('macbook')

urls = []
prices = []
names = []
descs = []

def search_amazon(query):
    driver.get('https://amazon.com/')
    searchBox = driver.find_element_by_id('twotabsearchtextbox')
    time.sleep(2)
    searchBox.send_keys(query)
    searchBox.send_keys(Keys.ENTER)
    time.sleep(3)
    firstResult = driver.find_element_by_css_selector('div[data-index="1"]>div')
    asin = firstResult.get_attribute('data-asin')
    print(asin)
    url = 'https://amazon.com/dp/' + asin
    driver.get(url)
    print(url)
    return url

search_amazon(query)
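As a side note, find_element_by_css_selector belongs to the older Selenium API; on Selenium 4 the two corrected lines would look like this (everything else unchanged):

from selenium.webdriver.common.by import By

# Selenium 4 style equivalent of the two corrected lines
firstResult = driver.find_element(By.CSS_SELECTOR, 'div[data-index="1"]>div')
asin = firstResult.get_attribute('data-asin')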
I am writing a bot that purchases items automatically. The current way I am going about this is I am putting the product info in a dictionary titled INFO, and referencing it whenever I need a specific product/color/etc.
Currently my code (specifically in findProduct()) checks whether an entry in temp_tuple is exactly the same as INFO['product'], for instance.
In my case, I look for a product and my code returns an error because there is a space at the end of some of the names, and my code cannot handle that.
However, I want to modify it to check whether the string appears on the webpage, so that my code still runs even with that extra space.
Here is enough of my code; it works as it is:
#!/usr/bin/env python3
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
import time
import requests
import bs4 as bs
from splinter import Browser
import helpers
from selenium.common.exceptions import ElementNotInteractableException
from config import INFO

def __init__(self, **info):
    self.base_url = 'http://www.supremenewyork.com/'
    self.shop = 'shop/all/'
    self.checkout = 'checkout/'
    self.info = info

class supremeBot(object):
    def __init__(self, **info):
        self.base_url = 'http://www.supremenewyork.com/'
        self.shop = 'shop/all/'
        self.info = info

    def initializeBrowser(self):
        driverz = self.info["driver"]
        path = helpers.get_driver_path(driver)
        if driverz == "geckodriver":
            self.b = Browser()
        elif driverz == "chromedriver":
            executable_path = {"executable_path": path}
            self.b = Browser('chrome', **executable_path)

    # This looks for the product based on what the category is
    def findProduct(self):
        category = str(INFO['category'])
        source = requests.get("http://www.supremenewyork.com/shop/all/" + category).text
        soup = bs.BeautifulSoup(source, 'lxml')
        temp_link = []
        temp_tuple = []
        for link in soup.find_all('a', href=True):
            temp_tuple.append((link['href'], link.text))
        for i in temp_tuple:
            if i[1] == INFO['product'] or i[1] == INFO['color']:  # <------------ I want this to recognize a partial string
                temp_link.append(i[0])
        # print(temp_link)

        # This creates the end of the final link
        self.final_link = list(
            set([x for x in temp_link if temp_link.count(x) == 2]))[0]

        # Concatenates the previous link w/ the website
        link = 'http://www.supremenewyork.com' + str(self.final_link)
        driver.get(link)

if __name__ == "__main__":
    driver = webdriver.Chrome('./chromedriver')

    '''
    BOT = supremeBot(**INFO)
    BOT.findProduct()
    order()
    '''

    BOT = supremeBot(**INFO)

    found_product = False
    counter = 1
    max_iter = 5
    while not found_product and counter < max_iter:
        found_product = BOT.findProduct()
        print("We tried ", counter, " times.")
        counter += 1
        if found_product:
            print('Couldn\'t find it')
            continue
        else:
            print('found it')
            order()

INFO = {
    "driver": "chromedriver",
    "product": "Supreme®/MLB New Era®",  # "Big Duffle Bag " is an example of a product that has the space after it
    "color": "Navy",
    "category": "hats",
    "size": "Medium",
    "namefield": "Bucky McNuts",
    "emailfield": "email@email.com",
    "phonefield": "(555)555-5555",
    "addressfield": "321 St",
}
In this case, if you replace Supreme®/MLB New Era® with "Big Duffle Bag ", you'll see that the code doesn't run once you remove the space after the word bag.
If anybody could help I would really appreciate it!
You can do this to check for a partial string:

if "part" in "partstring":
    print("the word 'part' is within 'partstring'")

Possible use here:

if INFO['product'] in i[1].lower() or INFO['color'] in i[1].lower():
    # do something

The .lower() lowercases the text from the site; for the comparison to work, the values in INFO should be lowercase as well.
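For example, inside findProduct() the comparison could normalize both sides before checking for a partial match. This is a sketch, not tested against the live site; the .strip() is my addition and also takes care of the trailing space mentioned in the question.

# Sketch: normalize both sides so "Big Duffle Bag " still matches "big duffle bag".
wanted_product = INFO['product'].strip().lower()
wanted_color = INFO['color'].strip().lower()

for href, link_text in temp_tuple:
    text = link_text.strip().lower()
    if wanted_product in text or wanted_color in text:
        temp_link.append(href)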
I'm a newbie getting into web scrapers. I've made something that works, but it takes hours and hours to get everything I need. I read something about using parallel processes to process the URLs but I have no clue how to go about it and incorporate it in what I already have. Help is much appreciated!
Here is my, still extremely messy, code. I'm still learning :)
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
from selenium.common.exceptions import NoSuchElementException
import time
import random
import pprint
import itertools
import csv
import pandas as pd

start_url = "https://www.nationalevacaturebank.nl/vacature/zoeken?query=&location=&distance=city&limit=100&sort=relevance&filters%5BcareerLevel%5D%5B%5D=Starter&filters%5BeducationLevel%5D%5B%5D=MBO"

driver = webdriver.Firefox()
driver.set_page_load_timeout(20)
driver.get(start_url)
driver.find_element_by_xpath('//*[@id="form_save"]').click()  # accepts cookies

wait = WebDriverWait(driver, random.randint(1500, 3200) / 1000.0)
j = random.randint(1500, 3200) / 1000.0
time.sleep(j)

num_jobs = int(driver.find_element_by_xpath('/html/body/div[3]/div/main/div[2]/div[3]/div/header/h2/span').text)
num_pages = int(num_jobs / 102)

urls = []
list_of_links = []

for i in range(num_pages + 1):
    try:
        elements = wait.until(EC.presence_of_all_elements_located((By.XPATH, '//*[@id="search-results-container"]//article/job/a')))
        for i in elements:
            list_of_links.append(i.get_attribute('href'))

        j = random.randint(1500, 3200) / 1000.0
        time.sleep(j)

        if 'page=3' not in driver.current_url:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[6]/a').click()
        else:
            driver.find_element_by_xpath('//html/body/div[3]/div/main/div[2]/div[3]/div/paginator/div/nav[1]/ul/li[5]/a').click()

        url = driver.current_url
        if url not in urls:
            print(url)
            urls.append(url)
        else:
            break
    except:
        continue

set_list_of_links = list(set(list_of_links))
print(len(set_list_of_links), "results")

driver.close()

def grouper(n, iterable):
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def remove_empty_lists(l):
    keep_going = True
    prev_l = l
    while keep_going:
        new_l = remover(prev_l)
        # are they identical objects?
        if new_l == prev_l:
            keep_going = False
        # set prev to new
        prev_l = new_l
    # return the result
    return new_l

def remover(l):
    newlist = []
    for i in l:
        if isinstance(i, list) and len(i) != 0:
            newlist.append(remover(i))
        if not isinstance(i, list):
            newlist.append(i)
    return newlist

vacatures = []
chunks = grouper(100, set_list_of_links)
chunk_count = 0

for chunk in chunks:
    chunk_count += 1
    print(chunk_count)
    j = random.randint(1500, 3200) / 1000.0
    time.sleep(j)
    for url in chunk:
        driver = webdriver.Firefox()
        driver.set_page_load_timeout(20)
        try:
            driver.get(url)
            driver.find_element_by_xpath('//*[@id="form_save"]').click()  # accepts cookies
            vacature = []
            vacature.append(url)
            j = random.randint(1500, 3200) / 1000.0
            time.sleep(j)

            elements = driver.find_elements_by_tag_name('dl')
            p_elements = driver.find_elements_by_tag_name('p')
            li_elements = driver.find_elements_by_tag_name('li')

            for i in elements:
                if "Salaris:" not in i.text:
                    vacature.append(i.text)

            running_text = list()
            for p in p_elements:
                running_text.append(p.text)

            text = [''.join(running_text)]

            remove_ls = ['vacatures', 'carrièretips', 'help', 'inloggen', 'inschrijven', 'Bezoek website', 'YouTube',
                         'Over Nationale Vacaturebank', 'Werken bij de Persgroep', 'Persberichten', 'Autotrack', 'Tweakers',
                         'Tweakers Elect', 'ITBanen', 'Contact', 'Carrière Mentors', 'Veelgestelde vragen',
                         'Vacatures, stages en bijbanen', 'Bruto Netto Calculator', 'Salariswijzer', 'Direct vacature plaatsen',
                         'Kandidaten zoeken', 'Bekijk de webshop', 'Intermediair', 'Volg ons op Facebook']

            for li in li_elements:
                if li.text not in remove_ls:
                    text.append(li.text)

            text = ''.join(text)
            vacature.append(text)
            vacatures.append(vacature)
            driver.close()
        except TimeoutException as ex:
            isrunning = 0
            print("Exception has been thrown. " + str(ex))
            driver.close()
        except NoSuchElementException:
            continue
Python's Selenium WebDriver is not thread-safe, which means your browser cannot correctly consume asynchronous calls from multiple threads. Try to scrape the site with requests and bs4 + lxml instead; it's much faster than Selenium. This answer can be helpful.
You're using Firefox, which is slower than Chrome in almost all real-life applications.
XPath is the slowest selector; match by id or class, and if that is not possible, then by CSS selector.
Use headless mode and don't load images unless you need to.
You can also use Scrapy, which is much faster and more flexible than anything else. See the link for more information.
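If the detail pages render without JavaScript, a minimal sketch of the parallel part could fetch the collected URLs with requests + bs4 in a thread pool; the extraction below is a placeholder you would adapt to the site's actual markup:

import concurrent.futures

import bs4
import pandas as pd
import requests

def fetch_vacature(url):
    # Plain HTTP fetch; assumes the page does not need JavaScript to render.
    resp = requests.get(url, timeout=20)
    soup = bs4.BeautifulSoup(resp.text, 'lxml')
    # Placeholder extraction: join all paragraph text; adapt the selectors as needed.
    text = ' '.join(p.get_text(' ', strip=True) for p in soup.find_all('p'))
    return [url, text]

vacatures = []
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    for row in pool.map(fetch_vacature, set_list_of_links):
        vacatures.append(row)

pd.DataFrame(vacatures, columns=['url', 'text']).to_csv('vacatures.csv', index=False)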
I am learning crawling skills, and I want to do the following:
login to a specific webpage (done)
go to a page that contains the links that I need
for each link in that page, crawl its content.
The problem is that I have tested my code on a single link and it worked, but when I tried the multi-level job it failed in a way I could not understand: it only crawls part of each link. I am wondering if there is some logical mistake in my code; please help. Below is the code.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        # options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        # options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  # chrome_options=options)
        self.username = username
        self.password = password

    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()

    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for hand input verify code
        time.sleep(20)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        # try first page first
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit'):
            # new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_sub)

    # checked
    def pageScroll(self, url):
        self.log('I am scrolling' + url)
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)

    def parse_sub(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''

            replys = []
            for i in sel.find_elements_by_xpath('.//div[@class="lzl_cnt"]'):
                user1 = i.find_element_by_xpath('.//a[@username]')
                user1 = self.driver.execute_script("return arguments[0].firstChild.textContent", user1)
                try:
                    user2 = i.find_element_by_xpath('.//span[@class="lzl_content_main"]/a[@username]')
                    user2 = self.driver.execute_script("return arguments[0].firstChild.textContent", user2)
                except:
                    user2 = name
                span = i.find_element_by_xpath('.//span[@class="lzl_content_main"]')
                reply = self.driver.execute_script('return arguments[0].lastChild.textContent;', span)
                replys.append(tuple(user1, user2, reply))
            yield {"topic": response.css(".core_title_txt::text").extract(), "name": name, "content": content, "replys": replys}

        # follow to next page
        # next_sel = self.driver.find_element_by_css_selector('#thread_theme_7 a:nth-child(3)')
        # next_url_name = next_sel.text
        # if next_sel and next_url_name == '下一页':
        #     next_url = next_sel.get_attribute('href')
        #     yield scrapy.Request(url=next_url, callback=self.parse_sub)
It seems you are using a hard-coded container for the links instead of a generic one, and hence getting back only some of the links in

for url in self.driver.find_elements_by_css_selector('a.j_th_tit')

This j_th_tit seems to be a dynamically generated class name and might not be the same for all anchor (a) tags.
You could try

for url in self.driver.find_elements_by_css_selector('a')

to get all links on the page.
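If a plain 'a' selector turns out to be too broad, one option is to keep it generic but filter by href; Tieba thread URLs normally contain /p/, but treat that pattern as an assumption to verify against the page:

# Sketch: collect every link, then keep only thread URLs before yielding requests.
for a in self.driver.find_elements_by_css_selector('a'):
    href = a.get_attribute('href')
    if href and '/p/' in href:  # assumed thread-URL pattern; verify in the page source
        yield scrapy.Request(url=href, callback=self.parse_sub)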