Scrape replies to a tweet using Python (Jupyter Notebook)

I've seen some questions and posts on how to scrape tweets of a specific handle, but not on how to do so to get all the replies to a particular tweet using Python via Jupyter Notebook.
Example: I want to scrape and export to Excel all the 340 replies to this public BBC tweet "Microplastics found in fresh Antarctic snow for the first time" (https://twitter.com/BBCWorld/status/1534777385249390593)
I need the following info: Reply date, Reply to (so I only get the replies to BBC, and not to other users in this thread) and the Reply text.
Inspecting the page's elements, I see that the reply container's class is named: css-1dbjc4n. Likewise:
The Reply date's class is: css-1dbjc4n r-1loqt21 r-18u37iz r-1ny4l3l r-1udh08x r-1qhn6m8 r-i023vh r-o7ynqc r-6416eg
The Reply to's class is: css-4rbku5 css-18t94o4 css-901oao r-14j79pv r-1loqt21 r-1q142lx r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-3s2u2q r-qvutc0
And the Reply text's class is: css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0
I have tried to run the code below, but the list remains empty :(
Results so far:
Empty DataFrame
Columns: [Date of Tweet, Replying to, Tweet]
Index: []
Can anyone help me, please?
Many thanks! :)
Code:
import sys
sys.path.append("path to site-packages in your pc")
from selenium import webdriver
from bs4 import BeautifulSoup
import pandas as pd
driver = webdriver.Chrome(executable_path=r"C:\chromedriver path in your pc")
dates=[] #List to store date of tweet
replies=[] #List to store reply to info
comments=[] #List to store comments
driver.get("https://twitter.com/BBCWorld/status/1534777385249390593")
twts=[]
content = driver.page_source
soup = BeautifulSoup(content)
for a in soup.findAll('div',href=True, attrs={'class':'css-1dbjc4n'}):
    datetweet=a.find('div', attrs={'class':'css-1dbjc4n r-1loqt21 r-18u37iz r-1ny4l3l r-1udh08x r-1qhn6m8 r-i023vh r-o7ynqc r-6416eg'})
    replytweet=a.find('div', attrs={'class':'css-4rbku5 css-18t94o4 css-901oao r-14j79pv r-1loqt21 r-1q142lx r-37j5jr r-a023e6 r-16dba41 r-rjixqe r-bcqeeo r-3s2u2q r-qvutc0'})
    commenttweet=a.find('div', attrs={'class':'css-901oao css-16my406 r-poiln3 r-bcqeeo r-qvutc0'})
    dates.append(datetweet.text)
    replies.append(replytweet.text)
    comments.append(commenttweet.text)
df = pd.DataFrame({'Date of Tweet':dates,'Replying to':replies,'Tweet':comments})
df.to_csv('tweets.csv', index=False, encoding='utf-8')
print(df)

I found two problems:
The page uses JavaScript to add elements, and that can take time, so you may need time.sleep(...) before you read driver.page_source, or better, use Selenium waits to wait for specific elements before you get driver.page_source.
The HTML doesn't use <div href="...">, so your findAll('div', href=True, ...) matches nothing. You have to remove href=True.
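For the first problem, a minimal sketch of the explicit-wait option (the article[@data-testid="tweet"] selector is an assumption based on the markup used later in this answer):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get("https://twitter.com/BBCWorld/status/1534777385249390593")
# Wait up to 15 seconds for at least one tweet <article> to be present
# before reading driver.page_source.
WebDriverWait(driver, 15).until(
    EC.presence_of_element_located((By.XPATH, '//article[@data-testid="tweet"]'))
)
content = driver.page_source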
EDIT:
Below is code I created, but it still needs to scroll the page to get more tweets, and later it may also need to click "Show more replies" to load even more.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
#from webdriver_manager.chrome import ChromeDriverManager
from webdriver_manager.firefox import GeckoDriverManager
import pandas as pd
import time
#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))
driver.get("https://twitter.com/BBCWorld/status/1534777385249390593")
time.sleep(10)
# TODO: scroll page to get more tweets
#for _ in range(2):
#    last = driver.find_elements(By.XPATH, '//div[@data-testid="cellInnerDiv"]')[-1]
#    driver.execute_script("arguments[0].scrollIntoView(true)", last)
#    time.sleep(3)
all_tweets = driver.find_elements(By.XPATH, '//div[@data-testid]//article[@data-testid="tweet"]')
tweets = []
print(len(all_tweets)-1)
for item in all_tweets[1:]:  # skip first tweet because it is BBC tweet
    #print('--- item ---')
    #print(item.text)
    print('--- date ---')
    try:
        date = item.find_element(By.XPATH, './/time').text
    except:
        date = '[empty]'
    print(date)
    print('--- text ---')
    try:
        text = item.find_element(By.XPATH, './/div[@data-testid="tweetText"]').text
    except:
        text = '[empty]'
    print(text)
    print('--- replying_to ---')
    try:
        replying_to = item.find_element(By.XPATH, './/div[contains(text(), "Replying to")]//a').text
    except:
        replying_to = '[empty]'
    print(replying_to)
    tweets.append([date, replying_to, text])
df = pd.DataFrame(tweets, columns=['Date of Tweet', 'Replying to', 'Tweet'])
df.to_csv('tweets.csv', index=False, encoding='utf-8')
print(df)
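For reference, the scrolling TODO above could be filled in roughly like this (a sketch, assuming Twitter keeps adding div[data-testid="cellInnerDiv"] rows as you scroll); it would run before collecting all_tweets:
# Scroll until no new reply rows are loaded.
last_count = 0
while True:
    cells = driver.find_elements(By.XPATH, '//div[@data-testid="cellInnerDiv"]')
    if len(cells) == last_count:  # nothing new appeared, stop scrolling
        break
    last_count = len(cells)
    driver.execute_script("arguments[0].scrollIntoView(true);", cells[-1])
    time.sleep(3)  # give JavaScript time to load more replies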

Related

Scraping: can't get stable results

I'm doing a scraping exercise on a job-search webpage. I want to get the link, company name, job title, salary, location, and posting date. I've run the same code multiple times; sometimes it gives the expected results in the salary column (the salary if the info is displayed, "N/A" otherwise), and sometimes it gives me something different: the salary if the info is displayed, "N/A", and some random character values in columns whose values should be "N/A". I have no problems with the other elements. Here is my code:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import pandas as pd
import requests
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
driver.get('https://ca.indeed.com/')
#Inputs a job title and location into the input boxes
input_box = driver.find_element(By.XPATH,'//*[@id="text-input-what"]')
input_box.send_keys('data analyst')
location = driver.find_element(By.XPATH,'//*[@id="text-input-where"]')
location.send_keys('toronto')
#Clicks on the search button
button = driver.find_element(By.XPATH,'//*[@id="jobsearch"]/button').click()
#Creates a dataframe
df = pd.DataFrame({'Link':[''], 'Job Title':[''], 'Company':[''], 'Location':[''],'Salary':[''], 'Date':['']})
#This loop goes through every page and grabs all the details of each posting
#Loop will only end when there are no more pages to go through
while True:
    #Imports the HTML of the current page into python
    soup = BeautifulSoup(driver.page_source, 'lxml')
    #Grabs the HTML of each posting
    postings = soup.find_all('div', class_ = 'slider_container css-g7s71f eu4oa1w0')
    len(postings)
    #grabs all the details for each posting and adds it as a row to the dataframe
    for post in postings:
        link = post.find('a').get('href')
        link_full = 'https://ca.indeed.com'+link
        name = post.find('h2', tabindex = '-1').text.strip()
        company = post.find('span', class_ = 'companyName').text.strip()
        try:
            location = post.find('div', class_ = 'companyLocation').text.strip()
        except:
            location = 'N/A'
        try:
            salary = post.find('div', attrs = {'class':'heading6 tapItem-gutter metadataContainer noJEMChips salaryOnly'}).text.strip()
        except:
            salary = 'N/A'
        date = post.find('span', class_ = 'date').text.strip()
        df = df.append({'Link':link_full, 'Job Title':name, 'Company':company, 'Location':location,'Salary':salary, 'Date':date},
                       ignore_index = True)
    #checks if there is a button to go to the next page, and if not will stop the loop
    try:
        button = soup.find('a', attrs = {'aria-label': 'Next'}).get('href')
        driver.get('https://ca.indeed.com'+button)
    except:
        break
Can I fix my code to get the expected results every time I run it? Also, an additional issue: I'm scraping around 60 pages, but the program usually stops 20 to 30 pages before the last page. Is there a way to fix the code so that it scrapes through to the last page every time?
Here is a simplified example with the requests library:
import requests
from bs4 import BeautifulSoup
cookies = {}
headers = {}
params = {
    'q': 'data analyst',
    'l': 'toronto',
    'from': 'searchOnHP',
}
response = requests.get('https://ca.indeed.com/jobs', params=params, cookies=cookies, headers=headers)
soup = BeautifulSoup(response.text)
postings = soup.find_all('div', class_ = 'slider_container css-g7s71f eu4oa1w0')
len(postings)
prints
15
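For the pagination part of the question, the same "Next" link that the Selenium code looks for can drive a requests-only loop. A rough sketch, assuming Indeed serves these pages to plain requests and keeps the aria-label="Next" anchor:
import requests
from bs4 import BeautifulSoup

cookies = {}
headers = {}
url = 'https://ca.indeed.com/jobs'
params = {'q': 'data analyst', 'l': 'toronto', 'from': 'searchOnHP'}
all_postings = []

while True:
    response = requests.get(url, params=params, cookies=cookies, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    all_postings.extend(soup.find_all('div', class_='slider_container css-g7s71f eu4oa1w0'))
    next_link = soup.find('a', attrs={'aria-label': 'Next'})
    if next_link is None:  # no "Next" button means the last page was reached
        break
    url = 'https://ca.indeed.com' + next_link.get('href')
    params = None  # the next-page href already carries the query string

print(len(all_postings))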

Beautifulsoup/Selenium how to scrape website until next page is disabled?

So I have a list of urls (called "data") that contains urls like
https://www.amazon.com/Airpods-Fashion-Protective-Accessories-Silicone/product-reviews/B08YD8JLNQ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
and
https://www.amazon.com/Keychain-R-fun-Protective-Accessories-Visible-Sky/product-reviews/B082W7DL1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
Some urls do not have the "Next Page" icon and some do. So far my code is something like this
from bs4 import BeautifulSoup
import requests
import csv
import os
import pandas as pd
from selenium import webdriver
from selenium.common import exceptions
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import StaleElementReferenceException
df = pd.read_csv(r'path to csv file', sep=',', usecols=['Url'], squeeze=True)
data = pd.read_csv(r'path to csv file', sep=',', usecols=['Url'], squeeze=True)
rows = []
for url in data:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    soup.prettify
    #names = soup.find_all('span', class="a-profile-name")
    # div.celwidget div.aok-relative span.a-profile-name
    #names = soup.find_all('div.celwidget div.aok-relative span', class= "a-profile-name")
    names = soup.find_all('div.celwidget div.aok-relative span.a-profile-name')
    rating = soup.find_all('div.celwidget div.aok-relative span.a-icon-alt')
    title = soup.find_all('div.celwidget div.aok-relative a.a-text-bold span')
    content = soup.find_all('div.celwidget div.aok-relative span.review-text-content span')
I want to scrape the names, ratings, etc. from the reviews until the last page, where the Next Page button would be disabled.
I'm not quite sure what to do from here; I looked around and many questions related to this were using .click() on Next Page, which I don't think is the answer I need/want.
The next page url is stored in a list item with class name a-last. So you could create a while loop that breaks if soup.find('li', class_='a-last') returns nothing anymore (i.e. if the last page has been reached):
from selenium import webdriver
from bs4 import BeautifulSoup
import time
options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
url='https://www.amazon.com/Keychain-R-fun-Protective-Accessories-Visible-Sky/product-reviews/B082W7DL1R/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews' #or https://www.amazon.com/s?k=maison+kitsune+airpod+pro+case
wd = webdriver.Chrome('chromedriver',options=options)
while True:
    wd.get(url)
    soup = BeautifulSoup(wd.page_source, "html.parser")
    #store data here
    try:
        url = 'https://www.amazon.com/' + soup.find('li', class_='a-last').find('a', href=True)['href']
        time.sleep(2) #prevent ban
    except:
        break
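The #store data here placeholder could reuse the CSS selectors from the question with soup.select; a rough sketch (the Amazon class names are copied from the question and may have changed):
# define once, before the while loop:
# rows = []
names    = [s.get_text(strip=True) for s in soup.select('div.celwidget div.aok-relative span.a-profile-name')]
ratings  = [s.get_text(strip=True) for s in soup.select('div.celwidget div.aok-relative span.a-icon-alt')]
titles   = [s.get_text(strip=True) for s in soup.select('div.celwidget div.aok-relative a.a-text-bold span')]
contents = [s.get_text(strip=True) for s in soup.select('div.celwidget div.aok-relative span.review-text-content span')]
rows.extend(zip(names, ratings, titles, contents))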

Data Mining IMDB Reviews - Only extracting the first 25 reviews

I am currently trying to extract all the reviews of the Spiderman Homecoming movie, but I am only able to get the first 25 reviews. In the browser I was able to keep loading more on IMDb to get all the reviews (originally it only shows the first 25), but for some reason I am unable to mine all the reviews after every review has been loaded. Does anyone know what I am doing wrong?
Below is the code I am running:
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
#Set the web browser
driver = webdriver.Chrome(executable_path=r"C:\Users\Kent_\Desktop\WorkStudy\chromedriver.exe")
#Go to the IMDb reviews page
driver.get("https://www.imdb.com/title/tt6320628/reviews?ref_=tt_urv")
#Loop load more button
wait = WebDriverWait(driver,10)
while True:
    try:
        driver.find_element_by_css_selector("button#load-more-trigger").click()
        wait.until(EC.invisibility_of_element_located((By.CSS_SELECTOR,".ipl-load-more__load-indicator")))
        soup = BeautifulSoup(driver.page_source, 'lxml')
    except Exception:
        break
#Scrape IMBD review
ans = driver.current_url
page = requests.get(ans)
soup = BeautifulSoup(page.content, "html.parser")
all = soup.find(id="main")
#Get the title of the movie
all = soup.find(id="main")
parent = all.find(class_ ="parent")
name = parent.find(itemprop = "name")
url = name.find(itemprop = 'url')
film_title = url.get_text()
print('Pass finding phase.....')
#Get the title of the review
title_rev = all.select(".title")
title = [t.get_text().replace("\n", "") for t in title_rev]
print('getting title of reviews and saving into a list')
#Get the review
review_rev = all.select(".content .text")
review = [r.get_text() for r in review_rev]
print('getting content of reviews and saving into a list')
#Make it into dataframe
table_review = pd.DataFrame({
    "Title" : title,
    "Review" : review
})
table_review.to_csv('Spiderman_Reviews.csv')
print(title)
print(review)
Well, actually, there's no need to use Selenium. The data is available by sending a GET request to the website's API in the following format:
https://www.imdb.com/title/tt6320628/reviews/_ajax?ref_=undefined&paginationKey=MY-KEY
where you have to provide a key for the paginationKey in the URL (...&paginationKey=MY-KEY)
The key is found in the class load-more-data:
<div class="load-more-data" data-key="g4wp7crmqizdeyyf72ux5nrurdsmqhjjtzpwzouokkd2gbzgpnt6uc23o4zvtmzlb4d46f2swblzkwbgicjmquogo5tx2">
</div>
So, to scrape all the reviews into a DataFrame, try:
import pandas as pd
import requests
from bs4 import BeautifulSoup
url = (
    "https://www.imdb.com/title/tt6320628/reviews/_ajax?ref_=undefined&paginationKey={}"
)
key = ""
data = {"title": [], "review": []}
while True:
    response = requests.get(url.format(key))
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the pagination key
    pagination_key = soup.find("div", class_="load-more-data")
    if not pagination_key:
        break
    # Update the `key` variable in-order to scrape more reviews
    key = pagination_key["data-key"]
    for title, review in zip(
        soup.find_all(class_="title"), soup.find_all(class_="text show-more__control")
    ):
        data["title"].append(title.get_text(strip=True))
        data["review"].append(review.get_text())
df = pd.DataFrame(data)
print(df)
Output (truncated):
title review
0 Terrific entertainment Spiderman: Far from Home is not intended to be...
1 THe illusion of the identity of Spider man. Great story in continuation of spider man home...
2 What Happened to the Bad Guys I believe that Quinten Beck/Mysterio got what ...
3 Spectacular One of the best if not the best Spider-Man mov...
...
...
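To match the original goal, the resulting df can then be written out with df.to_csv('Spiderman_Reviews.csv', index=False).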

is there a way to wait for an a tag element that could be one of two based on their href value in selenium?

So I'm not sure if this is practically valid, but I was wondering if there's a way in Selenium to wait for one of two <a> tags based on their href value or the text they contain.
What I'm trying to do is to open this page https://www.coingecko.com/en/exchanges, iterate through the exchange links, visit each one of them, then click on the About tab of each of those newly opened pages, as they contain the info to be extracted. The code actually worked up until about halfway through, when it failed to identify the tab properly, throwing a StaleElementException and an element-not-found error, since I located it through driver.find_element_by_text.
The problem is that the About tab changes from one page to the other, so it's either //ul[@role='tablist']/li[3] or li[2], and that's why I'm trying to wait for and click on the right element based on its href value, since one of the <a> tags on the page has an href containing the text '#about' ---> //ul[@role='tablist']/li[3]/a
Apologies if it wasn't straightforward but I was trying to pinpoint what the issue was until recently :)
This is the code that I've attempted so far; I'd be grateful if anyone could point me in the right direction.
from selenium.webdriver import Chrome
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from selenium.common.exceptions import NoSuchElementException, ElementNotVisibleException
webdriver = '/Users/karimnabil/projects/selenium_js/chromedriver-1'
driver = Chrome(webdriver)
num_of_pages = 4
exchanges_list = []
names_list = []
websites_list = []
emails_list = []
years_list = []
countries_list = []
twitter_list = []
for i in range(num_of_pages):
    url = 'https://www.coingecko.com/en/exchanges?page=' + str(i+1)
    driver.get(url)
    links = driver.find_elements_by_xpath("//tbody[@data-target='exchanges-list.tableRows']/tr/td[2]/div/span[2]/a")
    links = [url.get_attribute('href') for url in links]
    time.sleep(0.5)
    for link in links:
        driver.get(link)
        wait = WebDriverWait(driver, 2)
        wait.until(EC.text_to_be_present_in_element_value((By.XPATH, "//ul[@role='tablist']/li[position()=2 or position()=3]/a"), '#about'))
        try:
            name = driver.find_element_by_xpath("//div[@class='exchange-details-header-content']/div/h1").text
            website = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[8]/a").get_attribute('href')
            email = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[9]/a").get_attribute('href')
            year_est = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[10]").text
            inc_country = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[12]").text
            twitter = driver.find_element_by_xpath("//div[@class='row no-gutters']/div[16]/div[2]/div[2]/a").get_attribute('title')
        except:
            pass
        try:
            print('---------------')
            print('exchange name is : {}'.format(name))
            print('exchange website is : {}'.format(website))
            print('exchange email is : {}'.format(email))
            print('exchange established in year: {}'.format(year_est))
            print('exchange incorporated in : {}'.format(inc_country))
            print('exchange twitter handle is: {}'.format(twitter))
        except:
            pass
        try:
            names_list.append(name)
            websites_list.append(website)
            emails_list.append(email)
            years_list.append(year_est)
            countries_list.append(inc_country)
            twitter_list.append(twitter)
        except:
            pass
df = pd.DataFrame(list(zip(names_list, websites_list,emails_list, years_list, countries_list, twitter_list)), columns=['Ex_Names', 'Website', 'Support Email', 'Inc Year', 'Inc Country', 'Twitter Handle' ])
CoinGecko2_data = df.to_csv('CoinGecko4.csv', index=False)
If you know the href, just wait for: //a[contains(@href, 'my-href')]
I am not sure if there is a built-in wait for this, but you can create your own custom wait. Here is an example:
https://seleniumbyexamples.github.io/waitcustom
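Putting that together, a hedged sketch of such a wait (element_to_be_clickable is one option; the 'about' fragment in the href is taken from the question):
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(driver, 10)
# Wait for the tab whose link points at the "#about" anchor, whether it is
# li[2] or li[3], then click it.
about_tab = wait.until(
    EC.element_to_be_clickable((By.XPATH, "//ul[@role='tablist']//a[contains(@href, 'about')]"))
)
about_tab.click()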

I am Scraping multiple web pages which gives the same results as the first page in Python selenium. What would be the reason?

I am scraping goodreads.com using Selenium and Beautiful Soup. I am able to get the results for the first page. When I give the URL for the second page, it loads the first page and gives the first-page results only. I tried different pages and all of them load the first page only. What would be the reason, and how can I overcome this?
from selenium import webdriver
from bs4 import BeautifulSoup
from selenium.webdriver.firefox.options import Options
import pandas as pd
import time
import re
import requests
from itertools import zip_longest
from webdriver_manager.chrome import ChromeDriverManager
# First-page site URL: https://www.goodreads.com/shelf/show/business?page=1
driver = webdriver.Chrome(ChromeDriverManager().install())
# Reading the second page
driver.get("https://www.goodreads.com/shelf/show/non-fiction?page=2")
time.sleep(3)
summaryItems = driver.find_elements_by_xpath("//a[contains(@class, 'bookTitle')]")
job_links = [summaryItem.get_attribute("href") for summaryItem in summaryItems]
for job_link in job_links:
    driver.get(job_link)
    #Closing the pop-up window
    try:
        close = driver.find_elements_by_class_name('gr-iconButton')
        close.click()
    except:
        close = "None"
    try:
        # Taking book description
        more = driver.find_element_by_css_selector("#description > a:nth-child(3)").click()
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        #for item in soup.findAll("span", id=re.compile("^freeText"))[:2]:
        #    print(item.text)
        sections = soup.findAll("span", id=re.compile("^freeText"))[:2]
        print("message ")
        i = 0
        for item in soup.findAll("span", id=re.compile("^freeText"))[:2]:
            i = i+1
            if i == 2:
                desc.append(item.text)
    except:
        more = "None"
    try: # Taking book title
        # time.sleep(2)
        job_title = driver.find_element_by_xpath("//h1[@class='gr-h1 gr-h1--serif']").text
        #job_title = driver.find_element_by_id('bookTitle').find_element_by_class_name('gr-h1 gr-h1--serif').text
        title.append(job_title)
        #print(title)
    except:
        job_title = "None"
    #Taking Author name
    try:
        # time.sleep(2)
        authors = driver.find_element_by_xpath("//a[@class='authorName']").text
        author.append(authors)
        #print(author)
    except:
        authors = "None"
    #Taking Ratings
    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')
    rate = soup.find("span", itemprop="ratingValue").text.strip()
    rates = rate.replace('\n','')
    rating.append(rates)
driver.close()
Output:
I am able to scrape book title, author name, book description, and rating for the first page only.
You need to log in first to scrape data on the other pages.
Try adding the following code to your script:
driver = webdriver.Chrome(ChromeDriverManager().install())
# Add below code after webdriver.Chrome()
driver.get("https://www.goodreads.com/user/sign_in")
time.sleep(5)
driver.find_element_by_css_selector("#user_email").send_keys("your email")
driver.find_element_by_css_selector("#user_password").send_keys("your password")
driver.find_element_by_xpath("//input[#type='submit' and #value='Sign in']").click()
