How can I turn the link passed to session.get() into a variable? - python

My goal is to scrape multiple profile links and then scrape specific data on each of these profiles.
Here is my code to get multiple profile links (it should work fine):
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import re

# Fetch the video page and let requests_html render it before parsing.
session = HTMLSession()
r = session.get('https://www.khanacademy.org/computing/computer-science/algorithms/intro-to-algorithms/v/what-are-algorithms')
r.html.render(sleep=5)
soup = BeautifulSoup(r.html.html, 'html.parser')

# Every tag whose href contains "/profile/kaid" is treated as a profile link.
profile_pattern = re.compile("/profile/kaid")
profile_hrefs = [tag.extract()['href'] for tag in soup.find_all(href=profile_pattern)]

for href in profile_hrefs:
    # Drop the last 10 characters of the href (presumably a trailing
    # "discussion" segment) and make the link absolute.
    print('https://www.khanacademy.org' + href[:-10])
Now here is my code to get the specific data on just one profile (it should work fine too):
from bs4 import BeautifulSoup
from requests_html import HTMLSession
import re

# Scrape one Khan Academy profile page and write its statistics as a single
# CSV row. Any value that cannot be found on the page is recorded as 'NA'.
session = HTMLSession()
r = session.get('https://www.khanacademy.org/profile/Kkasparas/')
r.html.render(sleep=5)
soup = BeautifulSoup(r.html.html, 'html.parser')

# Statistics table: the second cell of every row holds the value.
user_info_table = soup.find('table', class_='user-statistics-table')
if user_info_table is not None:
    dates, points, videos = [tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
else:
    dates = points = videos = 'NA'

# Discussion statistics: the label sits in a <span>, and the number is the
# text node right before it.
data = {}
for stat in soup.find_all('div', class_='discussion-stat'):
    category = stat.find('span')
    data[category.text.strip()] = category.previousSibling.strip()

full_data_keys = ['questions', 'votes', 'answers', 'flags raised', 'project help requests', 'project help replies', 'comments', 'tips and thanks']
for header_value in full_data_keys:
    data.setdefault(header_value, 'NA')

# Last activity: the calendar container may be missing, and even when it is
# present it may contain no filled cell, so both lookups are guarded.
user_calendar = soup.find('div', class_='streak-calendar-scroll-container')
last_activity = None
if user_calendar is not None:
    last_activity = user_calendar.find('span', class_='streak-cell filled')
last_activity_date = last_activity['title'] if last_activity is not None else 'NA'

filename = "khanscrapetry1.csv"
headers = "date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date\n"
row = [
    dates,
    points.replace(",", ""),  # strip thousands separators so they do not split the column
    videos,
    data['questions'],
    data['votes'],
    data['answers'],
    data['flags raised'],
    data['project help requests'],
    data['project help replies'],
    data['comments'],
    data['tips and thanks'],
    last_activity_date,
]
# The context manager closes the file even if a write fails.
with open(filename, "w") as f:
    f.write(headers)
    f.write(",".join(row) + "\n")
My question is : how can I automate my scripts?
In other words: How can I merge these two scripts?
The goal is to create a sort of variable that is going to be a different profile link every time.
And then for each profile link to get the specific data and then put it into the csv file (a new row for each profile).

This is fairly straightforward. Instead of printing the profile links, store them in a list. Then loop through that list, scrape each link, and write a row to the CSV file. Some pages do not have all the details, so you have to handle those cases as well. In the code below I have marked them as 'NA' too, following the convention used in your code. One other note for the future: consider using Python's built-in csv module for reading and writing CSV files.
Merged Script
import csv
import re

from bs4 import BeautifulSoup
from requests_html import HTMLSession

full_data_keys = ['questions', 'votes', 'answers', 'flags raised', 'project help requests', 'project help replies', 'comments', 'tips and thanks']


def _render_soup(session, url):
    """Fetch *url*, let requests_html render it, and return the parsed soup."""
    r = session.get(url)
    r.html.render(sleep=5)
    return BeautifulSoup(r.html.html, 'html.parser')


def _profile_row(soup):
    """Build one CSV row (a list of strings) from a rendered profile page.

    Every value that cannot be found on the page is reported as 'NA'.
    """
    # Statistics table: the second cell of every row holds the value.
    user_info_table = soup.find('table', class_='user-statistics-table')
    if user_info_table is not None:
        dates, points, videos = [tr.find_all('td')[1].text for tr in user_info_table.find_all('tr')]
    else:
        dates = points = videos = 'NA'

    # Discussion statistics: the label sits in a <span>, and the number is
    # the text node right before it.
    data = {}
    for stat in soup.find_all('div', class_='discussion-stat'):
        category = stat.find('span')
        data[category.text.strip()] = category.previousSibling.strip()
    for header_value in full_data_keys:
        data.setdefault(header_value, 'NA')

    # The calendar container, or a filled cell inside it, may be missing.
    user_calendar = soup.find('div', class_='streak-calendar-scroll-container')
    last_activity = None
    if user_calendar is not None:
        last_activity = user_calendar.find('span', class_='streak-cell filled')
    last_activity_date = last_activity['title'] if last_activity is not None else 'NA'

    # Strip thousands separators from the points value.
    return [dates, points.replace(",", ""), videos] + [data[key] for key in full_data_keys] + [last_activity_date]


# One session is reused for every request instead of creating a new one per
# profile.
session = HTMLSession()
soup = _render_soup(session, 'https://www.khanacademy.org/computing/computer-science/algorithms/intro-to-algorithms/v/what-are-algorithms')

# Drop the last 10 characters of each href (presumably a trailing
# "discussion" segment) and make the link absolute.
# NOTE(review): the same profile appears to be linked more than once on the
# page, so profile_list (and therefore the CSV) can contain repeated entries.
profile_list = [
    'https://www.khanacademy.org' + tag['href'][:-10]
    for tag in soup.find_all(href=re.compile("/profile/kaid"))
]

filename = "khanscrapetry1.csv"
headers = "date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date\n"
# The context manager closes the file even if scraping one profile fails.
with open(filename, "w", newline="") as f:
    f.write(headers)
    # csv.writer quotes any field that happens to contain a comma.
    writer = csv.writer(f, lineterminator="\n")
    for link in profile_list:
        print("Scraping ", link)
        writer.writerow(_profile_row(_render_soup(session, link)))
Sample Output from khanscrapetry1.csv
date_joined, points, videos, questions, votes, answers, flags, project_request, project_replies, comments, tips_thx, last_date
6 years ago,1527829,1123,25,100,2,0,NA,NA,0,0,Saturday Jun 4 2016
6 years ago,1527829,1123,25,100,2,0,NA,NA,0,0,Saturday Jun 4 2016
6 years ago,3164708,1276,164,2793,348,67,16,3,5663,885,Wednesday Oct 31 2018
6 years ago,3164708,1276,164,2793,348,67,16,3,5663,885,Wednesday Oct 31 2018
NA,NA,NA,18,NA,0,0,NA,NA,0,NA,Monday Dec 24 2018
NA,NA,NA,18,NA,0,0,NA,NA,0,NA,Monday Dec 24 2018
5 years ago,240334,56,7,42,6,0,2,NA,12,2,Tuesday Nov 20 2018
5 years ago,240334,56,7,42,6,0,2,NA,12,2,Tuesday Nov 20 2018
...

Related

Python Multiprocessing in web crawler

I am trying to add multiprocessing to my web crawler. Most examples I see online pass the URL as the argument to the function given to map, map_async or apply_async. The data I am crawling is in a table, so I extract it with two BeautifulSoup find_all calls, one for the rows and one for the columns. Since the data is sometimes all on one page, which requires only one URL, I tried to use the list returned by find_all as the arguments for map_async, but this fails with the error "Fatal Python error: Cannot recover from stackoverflow."
The error occurred on the following line
return_list = pool.map_async(func, Species_all_recorded_data_List)
How could I solve it or where should the multiprocessing be implemented will be better?
The second problem is that if I put any code above the function crawl_all_data_mp, all of that code is executed again when pool = Pool() runs. I worked around it by simply moving all the other code below that function. That might not be correct, since I still can't really run the code because of the first error.
Looking for your advice
My code:
(1) Function to call for web crawling
from tkinter import filedialog
from tkinter import *
import csv
import os.path
from os import path
from Index import *
from Dragonfly import *
import codecs
from multiprocessing import Process, Value
#\ multiprocessing ver
# Worker: turn one result-table row into a DetailedTableInfo record.
# It reads the row's <td> cells, uses the text of the first cell as the record
# id, POSTs to the detail URL built from that id, and returns a
# DetailedTableInfo filled from the cells plus the R_LAT / R_LNG / R_MEMO
# values found on the detail page. `oldID` is accepted but not used in the
# visible body.
# NOTE(review): indentation was lost when this snippet was extracted; the code
# lines are kept exactly as found.
# NOTE(review): session, general_url, Detailed_discriptions_url, headers,
# BeautifulSoup and DetailedTableInfo are not defined in the visible code;
# presumably they come from the star imports - confirm.
def multiprocessing_row_data(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page, Species_all_record_data_Data_Set):
global DataCNT, stop_crawl_all_data_mp
tmp_List = Species_all_record_data_Data_Set.find_all('td')
# End conditions:
# 1. there is no data on the next page
# 2. when updating, the old data has been reached (found by inspecting its ID)
# 3. the count has gone over the limit
id = tmp_List[0].text
# NOTE(review): DataCNT is compared like a plain number here but is used as a
# shared Value (.value) further down - confirm which one it is meant to be.
if (len(id) == 0) or (DataCNT >= expecting_CNT)or (DataCNT >= Limit_CNT):
print(' --Finish crawl--' + ' crawl to page: ' + str(page) + ", ID: " + id + ", count: " + str(DataCNT))
# NOTE(review): assigning this global inside a pool worker presumably changes
# only that worker process's copy, not the flag the parent loop tests - confirm.
stop_crawl_all_data_mp = True
raise StopIteration
# Access the same value in memory when doing multiprocessing.
# NOTE(review): multiprocessing.Value documents get_lock(); `getlock` looks
# like a typo - confirm.
with DataCNT.getlock():
DataCNT.value += 1
response_DetailedInfo = session.post(general_url + Detailed_discriptions_url + id, headers=headers)
soup2 = BeautifulSoup(response_DetailedInfo.text, 'html.parser')
print("Current finished datas >> " + str(DataCNT.value) + " /" + str(Total_num) + " (" + str(DataCNT.value * 100 / Total_num) + "%)", end='\r')
return DetailedTableInfo(tmp_List[0].text, tmp_List[1].text, tmp_List[2].text, tmp_List[3].text, tmp_List[4].text, tmp_List[5].text, tmp_List[7].text, tmp_List[6].text,
soup2.find(id='R_LAT').get('value'),
soup2.find(id='R_LNG').get('value'),
Web_rawl_Species_family_name,
Web_rawl_Species_name,
soup2.find(id='R_MEMO').get('value'))
# Crawl result pages one after another until stop_crawl_all_data_mp is set.
# For each page it POSTs the listing URL, collects every element with
# id='theRow', hands those rows to multiprocessing_row_data through a pool, and
# appends the non-empty results of that page to DataList as one list.
# Returns [DataList, page].
# NOTE(review): indentation was lost when this snippet was extracted; the code
# lines are kept exactly as found, so it cannot be told from here whether
# pool.close() / pool.join() run inside or after the loop.
# NOTE(review): the visible imports only bring in Process and Value from
# multiprocessing; the names `multiprocessing` and `partial` used below are not
# imported in view - confirm they are provided elsewhere.
def crawl_all_data_mp(Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID):
page = 0
DataList = []
while not stop_crawl_all_data_mp:
pool = multiprocessing.Pool(10)
Species_all_recorded_data = session.post( general_url +
species_all_record_data_first_url +
species_all_record_data_page_url + str(page) +
species_all_record_data_species_url +
Species_class_key[Web_rawl_Species_family_name] +
Species_key[Web_rawl_Species_name],
headers=headers)
soup = BeautifulSoup(Species_all_recorded_data.text, 'html.parser')
Species_all_recorded_data_List = soup.find_all(id='theRow')
func = partial(multiprocessing_row_data, Web_rawl_Species_family_name, Web_rawl_Species_name, Total_num, Limit_CNT, expecting_CNT, oldID, page)
# NOTE(review): map_async has to pickle every item it sends to a worker. The
# items here are BeautifulSoup elements, and pickling them is the likely source
# of the reported "Cannot recover from stackoverflow" error - confirm, e.g. by
# passing plain strings instead.
return_list = pool.map_async(func, Species_all_recorded_data_List)
DataList.append(list(filter(None, return_list.get())))
page += 1
# Make sure that when main is finished, the subfunctions still keep running.
pool.close()
pool.join()
return [DataList, page]
(2) main
it goes wrong on the following line for calling the function above
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
the main code:
# --main--
# Entry point: crawl the records of one species and merge them into its CSV.
# Steps visible below: build the output path, read the existing CSV (if any)
# to get the newest stored ID and the row count, log in, look up the total
# number of records, crawl, convert the results to rows, and write the file.
# NOTE(review): indentation was lost when this snippet was extracted; the code
# lines are kept exactly as found.
# NOTE(review): current_path, myaccount, mypassword, CSV_Head, Species_key,
# Species_class_key, Login_Web and Find_species_total_data are not defined in
# the visible code; presumably they come from the star imports - confirm.
if __name__ == '__main__':
# setting
Input_species_famliy = "細蟌科"
Input_species = "四斑細蟌"
limit_cnt = 6000
folder = 'Crawl_Data\\' + Species_class_key[Input_species_famliy]
File_name = folder + "\\" + Species_class_key[Input_species_famliy] + Species_key[Input_species] +'.csv'
oldID = 0
oldData_len = 0
print("--Start crawl-- " + Input_species_famliy + " " + Input_species)
print("[folder]: " + folder)
stop_crawl_all_data_mp = False
# check whether the file exists
file_check = path.exists(current_path + "\\" + File_name)
# get the old ID
if file_check:
file_size = os.stat(current_path + "\\" + File_name).st_size
if not file_size == 0:
with open(File_name, newline='', errors = "ignore") as F:
R = csv.reader(F)
oldData = [line for line in R]
oldID = oldData[0][0]
oldData_len = len(oldData)-1
# login
Login_Web(myaccount, mypassword)
# find the total number of records for the input species (expected to run once)
Species_total_num_Dict = Find_species_total_data()
# get the data
Total_num = int(Species_total_num_Dict[Input_species])
#[datatmpList, page] = crawl_all_data(Input_species_famliy, Input_species, Total_num, limit_cnt, oldID)
expecting_CNT = Total_num - oldData_len # number of records that still need to be updated or crawled
[datatmpList, page] = crawl_all_data_mp(Input_species_famliy, Input_species, Total_num, limit_cnt, expecting_CNT, oldID)
Data = []
# NOTE(review): crawl_all_data_mp appends one list per page to the list it
# returns, so Data_tmp here would be a list of records rather than a single
# record with these attributes - confirm.
for Data_tmp in datatmpList:
Data.append([Data_tmp.SpeciesFamily,
Data_tmp.Species,
Data_tmp.IdNumber,
Data_tmp.Dates,
Data_tmp.Times,
Data_tmp.User,
Data_tmp.City,
Data_tmp.Dictrict,
Data_tmp.Place,
Data_tmp.Altitude,
Data_tmp.Latitude,
Data_tmp.Longitude,
Data_tmp.Description
])
# create the output directory automatically
newDir = current_path + "\\" + folder
if (not os.path.isdir(newDir)):
os.mkdir(newDir)
# 'a' stands for append, which appends the new data to the old data
with open(File_name, mode='a', newline='', errors = "ignore") as employee_file:
employee_writer = csv.writer(employee_file, delimiter=',', quoting=csv.QUOTE_MINIMAL)
# initial write: the file does not exist yet or is empty
if ((not file_check) or (file_size == 0)):
employee_writer.writerow(CSV_Head)
employee_writer.writerows(Data)
# otherwise insert the new data in front of the old data
# NOTE(review): the file is open in append mode, so writing oldData here
# appears to add the old rows a second time after the existing content rather
# than replacing it - confirm.
else:
for i in range(0, len(Data)):
oldData.insert(i, Data[i])
employee_writer.writerows(oldData)

np.arange returns <Response 200> instead of value

I'm trying to write a script that scrapes the text of multiple webpages with slightly differing URLs. I want to go through the pages with an np.arange range whose values are inserted into the URL as strings. But there must be something wrong with the URL the script is composing: the document that stores the scraped text contains only messages like "this site does not exist anymore". The steps I have taken to get closer to the solution are detailed below. Here is my code.
from bs4 import BeautifulSoup
import numpy as np
import datetime
from time import sleep
from random import randint
# Build today's date string, then request index-1.htm ... index-19.htm for that
# date and append the extracted text to a local file.
# NOTE(review): indentation was lost when this snippet was extracted; the code
# lines are kept exactly as found.
# NOTE(review): `requests` is used below but is not among the visible imports.
datum = datetime.datetime.now()
pages = np.arange(1, 20, 1)
datum_jetzt = datum.strftime("%Y") + "-" + datum.strftime("%m") + "-" + datum.strftime("%d")
url = "https://www.shabex.ch/pub/" + datum_jetzt + "/index-"
# This request uses the URL prefix without a page number or ".htm" suffix.
results = requests.get(url)
file_name = "" + datum.strftime("%Y") + "-" + datum.strftime("%m") + "-" + datum.strftime("%d") + "-index.htm"
for page in pages:
# NOTE(review): this rebinds the loop variable `page` to the Response object,
# so any later str(page) yields "<Response [200]>" instead of the page number.
page = requests.get("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm")
# NOTE(review): this parses `results` (the single request made before the
# loop), not the per-page response fetched on the previous line.
soup = BeautifulSoup(results.text, "html.parser")
texte = soup.get_text()
sleep(randint(2,5))
f = open(file_name, "a")
f.write(texte)
# NOTE(review): `f.close` without parentheses only references the method; it
# does not call it.
f.close
I found that if I enter print("https://www.shabex.ch/pub/" + datum_jetzt + "/index-" + str(page) + ".htm") in the console, I get https://www.shabex.ch/pub/2020-05-18/index-<Response [200]>.htm. So it looks as if the np.arange function returns the response of the web server instead of the value I seek.
Where have I gone wrong?

How To Scrape Specific Chracter in Selenium using Python

I want to scrape 70 characters from this HTML code:
<p>2) Proof of payment emailed to satrader03<strong>#gmail.com</strong> direct from online banking 3) Selfie of you holding your ID 4) Selfie of you holding your bank card from which payment will be made OR 5) Skype or what's app Video call while logged onto online banking displaying account name which should match personal verified name Strictly no 3rd party payments</p>
I want to know how to scrape a specific number of characters with Selenium; for example, I want to scrape 30 characters, or some other number.
Here is my code:
# Write one CSV line per search result: title, link text, description text.
# NOTE(review): `title` and `link` are not defined in this fragment; they are
# assigned in the full code shown further down. Indentation was lost when the
# snippet was extracted; the code lines are kept exactly as found.
description = driver.find_elements_by_css_selector("p")
items = len(title)
with open('btc_gmail.csv','a',encoding="utf-8") as s:
for i in range(items):
s.write(str(title[i].text) + ',' + link[i].text + ',' + description[i].text + '\n')
How to scrape 30 characters or 70 or something
Edit (full code):
import csv

# Walk through the Bing result pages and append, for every result, its title,
# its link text and characters 30-70 of its description to btc_gmail.csv.
driver = webdriver.Firefox()
r = randrange(3,7)
# The file is opened once for the whole crawl; the context manager closes it
# even if a page fails to load.
with open('btc_gmail.csv', 'a', encoding="utf-8", newline='') as s:
    # csv.writer quotes fields that contain commas or quotes, which plain
    # string concatenation did not.
    writer = csv.writer(s, lineterminator='\n')
    for url_p in url_pattren:
        time.sleep(3)
        url1 = 'https://www.bing.com/search?q=site%3alocalbitcoins.com+%27%40gmail.com%27&qs=n&sp=-1&pq=site%3alocalbitcoins+%27%40gmail.com%27&sc=1-31&sk=&cvid=9547A785CF084BAE94D3F00168283D1D&first=' + str(url_p) + '&FORM=PERE3'
        driver.get(url1)
        time.sleep(r)
        title = driver.find_elements_by_tag_name('h2')
        link = driver.find_elements_by_css_selector("cite")
        description = driver.find_elements_by_css_selector("p")
        # zip stops at the shortest list, so a page with fewer <cite> or <p>
        # elements than <h2> elements no longer raises IndexError.
        for heading, cite, paragraph in zip(title, link, description):
            writer.writerow([heading.text, cite.text, paragraph.text[30:70]])
Any Solution?
You can get text of the tag and then use slice on string
# Take the text of the first <p> element and slice it like any other string.
# NOTE(review): the output shown on the last line looks hand-written. `.text`
# presumably returns the rendered text without the <strong> tags, and print()
# would not show the surrounding quotes - confirm against a real run.
>>> description = driver.find_elements_by_css_selector("p")[0].text
>>> print(description[30:70]) # printed from 30th to 70th symbol
'satrader03<strong>#gmail.com</strong>'

Excluding 'duplicated' scraped URLs in Python app?

I've never used Python before, so excuse my lack of knowledge, but I'm trying to scrape a XenForo forum for all of its threads. So far so good, except that it picks up multiple URLs for the same thread, one for each page of it. I've posted some data below to explain what I mean.
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/page-9
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/page-10
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/page-11
Really, what I would ideally want to scrape is just one of these.
forums/my-first-forum/: threads/my-gap-year-uni-story.13846/
Here is my script:
from bs4 import BeautifulSoup
import requests
def get_source(url):
    """Fetch *url* with requests.get and return the response's ``content``."""
    response = requests.get(url)
    return response.content
def is_forum_link(self):
    """Return True unless ``self.find('special string')`` gives -1.

    For a plain string argument this means: True when the string contains
    'special string'.
    """
    position = self.find('special string')
    return position != -1
def fetch_all_links_with_word(url, word):
    """Return every ``<a>`` element on the page at *url* whose href contains *word*.

    The page is downloaded with get_source and parsed with the lxml parser.
    """
    source = get_source(url)
    soup = BeautifulSoup(source, 'lxml')
    # Quote the attribute value (escaping backslashes and double quotes) so
    # that words containing characters such as ".", "/" or spaces still form
    # a valid CSS selector.
    quoted_word = word.replace('\\', '\\\\').replace('"', '\\"')
    return soup.select('a[href*="' + quoted_word + '"]')
# Collect every forum link on the main page (skipping .rss links), then print
# every thread link found on each forum page.
# NOTE(review): indentation was lost when this snippet was extracted; the code
# lines are kept exactly as found.
main_url = "http://example.com/forum/"
forumLinks = fetch_all_links_with_word(main_url, "forums")
forums = []
for link in forumLinks:
if link.has_attr('href') and link.attrs['href'].find('.rss') == -1:
forums.append(link.attrs['href']);
print('Fetched ' + str(len(forums)) + ' forums')
threads = {}
for link in forums:
threadLinks = fetch_all_links_with_word(main_url + link, "threads")
for threadLink in threadLinks:
# Every matching href is printed as-is, so ".../page-N" variants of one thread
# show up as separate lines.
print(link + ': ' + threadLink.attrs['href'])
# NOTE(review): the dict is keyed by the forum link, so each assignment
# overwrites the previous thread of the same forum; the count printed below is
# therefore the number of forums that had threads, not the number of threads.
threads[link] = threadLink
print('Fetched ' + str(len(threads)) + ' threads')
This solution assumes that what should be removed from the url to check for uniqueness is always going to be "/page-#...". If that is not the case this solution will not work.
Instead of using a list to store your urls you can use a set, which will only add unique values. Then in the url remove the last instance of "page" and anything that comes after it if it is in the format of "/page-#", where # is any number, before adding it to the set.
# Collect the hrefs in a set so that each URL is stored only once.
forums = set()
for link in forumLinks:
    if not link.has_attr('href'):
        continue
    href = link.attrs['href']
    if '.rss' in href:
        continue
    # Cut a trailing "/page-<digit>..." suffix (last occurrence only),
    # keeping the slash, so every page of a thread maps to the same URL.
    cut = href.rfind('/page-')
    if cut > 0 and href[cut + 6:cut + 7].isdigit():
        href = href[:cut + 1]
    forums.add(href)

Using beautifulsoup to get prices from craigslist

I am new to coding in Python (maybe a couple of days in) and am basically learning from other people's code on Stack Overflow. The code I am trying to write uses BeautifulSoup to get the pid and the corresponding price for motorcycles on craigslist. I know there are many other ways of doing this, but my current code looks like this:
from bs4 import BeautifulSoup
from urllib2 import urlopen
# Python 2 script (print statements): walk through nine craigslist listing
# pages and append "<page count> <pid> <price>" lines to pid.txt.
# NOTE(review): indentation was lost when this snippet was extracted; the code
# lines are kept exactly as found.
u = ""
count = 0
while (count < 9):
site = "http://sfbay.craigslist.org/mca/" + str(u)
html = urlopen(site)
soup = BeautifulSoup(html)
postings = soup('p',{"class":"row"})
# NOTE(review): the file is reopened on every pass and never closed in the
# visible code.
f = open("pid.txt", "a")
for post in postings:
x = post.getText()
y = post['data-pid']
prices = post.findAll("span", {"class":"itempp"})
# NOTE(review): findAll returns a list-like result, which never equals "", so
# this branch is never taken and postings without a price fall through to the
# else branch.
if prices == "":
w = 0
else:
# The price is cut out of the string form of the result list by fixed offsets.
z = str(prices)
z = z[:-8]
w = z[24:]
filewrite = str(count) + " " + str(y) + " " +str(w) + '\n'
print y
print w
f.write(filewrite)
count = count + 1
index = 100 * count
print "index is" + str(index)
u = "index" + str(index) + ".html"
It works fine, and as I keep learning I plan to optimize it. The problem I have right now is that entries without a price are still showing up. Is there something obvious that I am missing?
thanks.
The problem is how you're comparing prices. You say:
prices = post.findAll("span", {"class":"itempp"})
In BeautifulSoup, .findAll returns a list of elements. When you compare prices to an empty string, the comparison will always be False.
>>>[] == ""
False
Change `if prices == "":` to `if prices == []:` (or, more idiomatically, `if not prices:`) and everything should be fine.
I hope this helps.

Categories

Resources