YouTube scraper with requests & urllib.parse - Python

I am trying to get the subscriber count for the channel of each video I fetch, but I don't know why it isn't working.
import json
import urllib.parse

import requests

class YoutubeSearch:
    def __init__(self, search_terms: str, max_results=None):
        self.search_terms = search_terms
        self.max_results = max_results
        self.videos = self._search()

    def _search(self):
        encoded_search = urllib.parse.quote_plus(self.search_terms)
        BASE_URL = "https://youtube.com"
        url = f"{BASE_URL}/results?search_query={encoded_search}"
        response = requests.get(url).text
        # retry until the page actually contains the embedded ytInitialData blob
        while "ytInitialData" not in response:
            response = requests.get(url).text
        results = self._parse_html(response)
        if self.max_results is not None and len(results) > self.max_results:
            return results[: self.max_results]
        return results

    def _parse_html(self, response):
        results = []
        # slice out the JSON object assigned to ytInitialData
        start = (
            response.index("ytInitialData")
            + len("ytInitialData")
            + 3
        )
        end = response.index("};", start) + 1
        json_str = response[start:end]
        data = json.loads(json_str)
        videos = data["contents"]["twoColumnSearchResultsRenderer"]["primaryContents"][
            "sectionListRenderer"
        ]["contents"][0]["itemSectionRenderer"]["contents"]
        # get subscriberCountText from each video
        for video in videos:
            res = {}
            if "videoRenderer" in video.keys():
                video_data = video.get("videoRenderer", {})
                # res["id"] = video_data.get("videoId", None)
                res["title"] = video_data.get("title", {}).get("runs", [[{}]])[0].get("text", None)
                # res["channel"] = video_data.get("longBylineText", {}).get("runs", [[{}]])[0].get("text", None)
                res["url_suffix"] = video_data.get("navigationEndpoint", {}).get("commandMetadata", {}).get("webCommandMetadata", {}).get("url", None)
                res["subscribers"] = video_data.get("subscriberCountText", {}).get("simpleText", 0)
                results.append(res)
        return results

    def to_json(self, clear_cache=True):
        result = json.dumps({"videos": self.videos})
        if clear_cache:
            self.videos = ""
        return result
Everything works except the subscriber count, and I know the semantic error is in
    res["subscribers"] = video_data.get("subscriberCountText", {}).get("simpleText", 0)
For subscriberCountText I think I need to fetch data for each channel I get, but how?
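As far as I can tell (this is an assumption about YouTube's internal, undocumented data, so it may be wrong or change without notice), the videoRenderer objects in search results simply do not carry subscriberCountText; subscriber counts live on the channel's own page. So yes, one extra request per channel would be needed. A hedged sketch, reusing the channel URL that the commented-out "channel" line already extracts via longBylineText:

import re
import requests

def get_subscriber_count(channel_url_suffix):
    """Fetch a channel page and try to pull subscriberCountText out of the
    embedded ytInitialData. Returns None if the assumed structure isn't there."""
    page = requests.get("https://www.youtube.com" + channel_url_suffix).text
    # assumption: the channel header still embeds something like
    # "subscriberCountText":{"simpleText":"1.23M subscribers"}
    match = re.search(r'"subscriberCountText":\{"simpleText":"([^"]+)"', page)
    return match.group(1) if match else None

# Inside _parse_html, the channel suffix could come from longBylineText:
# channel_suffix = (video_data.get("longBylineText", {}).get("runs", [{}])[0]
#                   .get("navigationEndpoint", {}).get("commandMetadata", {})
#                   .get("webCommandMetadata", {}).get("url"))
# res["subscribers"] = get_subscriber_count(channel_suffix) if channel_suffix else None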

Related

None of [([ ])] are in the columns

I keep getting the KeyError below and can't figure out what it means or what I should be doing differently.
KeyError: "None of [Index(['team totals', 'mp_max', 'fg_max', 'fga_max', 'fg%_max', '3p_max',\n '3pa_max', '3p%_max', 'ft_max', 'fta_max', 'ft%_max', 'orb_max',\n 'drb_max', 'trb_max', 'ast_max', 'stl_max', 'blk_max', 'tov_max',\n 'pf_max', 'pts_max', '+/-_max', 'ts%_max', 'efg%_max', '3par_max',\n 'ftr_max', 'orb%_max', 'drb%_max', 'trb%_max', 'ast%_max', 'stl%_max',\n 'blk%_max', 'tov%_max', 'usg%_max', 'ortg_max', 'drtg_max'],\n dtype='object')] are in the [columns]"
my code is
from bs4 import BeautifulSoup
import pandas
import os

SEASONS = list(range(2016, 2017))
DATA_DIR = "data"
STANDINGS_DIR = os.path.join(DATA_DIR, "standings")
SCORES_DIR = os.path.join(DATA_DIR, "scores")

box_scores = os.listdir(SCORES_DIR)
box_scores = [os.path.join(SCORES_DIR, f) for f in box_scores if f.endswith(".html")]

def parse_html(box_score):
    with open(box_score) as f:
        html = f.read()
    soup = BeautifulSoup(html)
    [s.decompose() for s in soup.select("tr.over_header")]  # removes the tr tags with class over_header from the html
    [s.decompose() for s in soup.select("tr.thead")]
    return soup

def read_line_score(soup):
    line_score = pandas.read_html(str(soup), attrs={"id": "line_score"})[0]
    cols = list(line_score.columns)
    cols[0] = "team"
    cols[-1] = "total"
    line_score.columns = cols
    line_score = line_score[["team", "total"]]
    return line_score

def read_stats(soup, team, stat):
    df = pandas.read_html(str(soup), attrs={"id": f"box-{team}-game-{stat}"}, index_col=0)[0]
    df = df.apply(pandas.to_numeric, errors="coerce")
    return df

def read_season_info(soup):
    nav = soup.select("#bottom_nav_container")[0]
    hrefs = [a["href"] for a in nav.find_all("a")]
    season = os.path.basename(hrefs[1]).split("_")[0]
    return season

base_cols = None
games = []
for box_score in box_scores:
    soup = parse_html(box_score)
    line_score = read_line_score(soup)
    teams = list(line_score["team"])  # grabs just the teams who played each other

    summaries = []
    for team in teams:
        basic = read_stats(soup, team, "basic")
        advanced = read_stats(soup, team, "advanced")

        totals = pandas.concat([basic.iloc[-1:], advanced.iloc[-1:]])
        totals.index = totals.index.str.lower()  # to lower case

        maxes = pandas.concat([basic.iloc[:-1, :].max(), advanced.iloc[:-1, :].max()])
        maxes.index = maxes.index.str.lower() + "_max"

        summary = pandas.concat([totals, maxes])

        if base_cols is None:
            base_cols = list(summary.index.drop_duplicates(keep="first"))
            base_cols = [b for b in base_cols if "bpm" not in b]

        summary = summary[base_cols]
        summaries.append(summary)
    summary = pandas.concat(summaries, axis=1).T

    game = pandas.concat([summary, line_score], axis=1)
    game["home"] = [0, 1]

    game_opp = game.iloc[::-1].reset_index()
    game_opp.columns += "_opp"

    full_game = pandas.concat([game, game_opp], axis=1)
    full_game["season"] = read_season_info(soup)
    full_game["date"] = os.path.basename(box_score)[:8]
    full_game["date"] = pandas.to_datetime(full_game["date"], format="%Y%m%d")
    full_game["won"] = full_game["total"] > full_game["total_opp"]
    games.append(full_game)

    if len(games) % 100 == 0:
        print(f"{len(games)} / {len(box_scores)}")

Web Scraper for Job Search Platform not Scraping Content

I am trying to scrape data from a job search platform called Job Street. The web crawler runs, but the generated CSV file is empty, with no data. The expected output should be a list of jobs with the job title, description, etc.
Below is my code. I am performing this task using Selenium. I would really appreciate the help. Thank you in advance.
import json
import math
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome

# note: these headers are never passed to the driver; selenium sends its own
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) (KHTML, like Gecko) Chrome/102.0.5005.61'}
path = "/Users/Downloads/jobstreet-scraper-main/chromedriver"
driver = Chrome(executable_path=path)
time.sleep(2)
base_url = "https://www.jobstreet.com.my/en/job-search/{}-jobs/{}/"

def get_page_number(keyword):
    # input: keyword for job postings
    # output: number of pages
    url = base_url.format(keyword, 1)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    result_text = soup.find("span", {"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _1d0g9qk4 _18qlyvc8"})
    results = result_text.text.split()
    result = result_text.text.split()[-2]
    result1 = result.replace(',', '')
    result2 = int(result1)
    page_number = math.ceil(result2 / 30)
    return page_number

def job_page_scraper(link):
    url = "https://www.jobstreet.com.my" + link
    print("scraping...", url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    scripts = soup.find_all("script")
    for script in scripts:
        if script.contents:
            txt = script.contents[0].strip()
            if 'window.REDUX_STATE = ' in txt:
                jsonStr = script.contents[0].strip()
                jsonStr = jsonStr.split('window.REDUX_STATE = ')[1].strip()
                jsonStr = jsonStr.split('}}}};')[0].strip()
                jsonStr = jsonStr + "}}}}"
                jsonObj = json.loads(jsonStr)
                job = jsonObj['details']
                job_id = job['id']
                job_expired = job['isExpired']
                job_confidential = job['isConfidential']
                job_salary_min = job['header']['salary']['min']
                job_salary_max = job['header']['salary']['max']
                job_salary_currency = job['header']['salary']['currency']
                job_title = job['header']['jobTitle']
                company = job['header']['company']['name']
                job_post_date = job['header']['postedDate']
                job_internship = job['header']['isInternship']
                company_website = job['companyDetail']['companyWebsite']
                company_avgProcessTime = job['companyDetail']['companySnapshot']['avgProcessTime']
                company_registrationNo = job['companyDetail']['companySnapshot']['registrationNo']
                company_workingHours = job['companyDetail']['companySnapshot']['workingHours']
                company_facebook = job['companyDetail']['companySnapshot']['facebook']
                company_size = job['companyDetail']['companySnapshot']['size']
                company_dressCode = job['companyDetail']['companySnapshot']['dressCode']
                company_nearbyLocations = job['companyDetail']['companySnapshot']['nearbyLocations']
                company_overview = job['companyDetail']['companyOverview']['html']
                job_description = job['jobDetail']['jobDescription']['html']
                job_summary = job['jobDetail']['summary']
                job_requirement_career_level = job['jobDetail']['jobRequirement']['careerLevel']
                job_requirement_yearsOfExperience = job['jobDetail']['jobRequirement']['yearsOfExperience']
                job_requirement_qualification = job['jobDetail']['jobRequirement']['qualification']
                job_requirement_fieldOfStudy = job['jobDetail']['jobRequirement']['fieldOfStudy']
                # job_requirement_industry = job['jobDetail']['jobRequirement']['industryValue']['label']
                job_requirement_skill = job['jobDetail']['jobRequirement']['skills']
                job_employment_type = job['jobDetail']['jobRequirement']['employmentType']
                job_languages = job['jobDetail']['jobRequirement']['languages']
                job_benefits = job['jobDetail']['jobRequirement']['benefits']
                job_apply_url = job['applyUrl']['url']
                job_location_zipcode = job['location'][0]['locationId']
                job_location = job['location'][0]['location']
                job_country = job['sourceCountry']
                return [job_id, job_title, job_expired, job_confidential, job_salary_min, job_salary_max, job_salary_currency, company, job_post_date, job_internship, company_website, company_avgProcessTime, company_registrationNo, company_workingHours, company_facebook, company_size, company_dressCode, company_nearbyLocations, company_overview, job_description, job_summary, job_requirement_career_level, job_requirement_fieldOfStudy, job_requirement_yearsOfExperience, job_requirement_qualification, job_requirement_skill, job_employment_type, job_languages, job_benefits, job_apply_url, job_location_zipcode, job_location, job_country]

def page_crawler(keyword):
    # input: keyword for job postings
    # output: dataframe of links scraped from each page
    # page number
    page_number = get_page_number(keyword)
    job_links = []
    for n in range(page_number):
        print('Loading page {} ...'.format(n + 1))
        url = base_url.format(keyword, n + 1)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        # extract all job links
        links = soup.find_all('a', {'class': 'sx2jih0'})
        job_links += links
    jobs = []
    for link in job_links:
        job_link = link['href'].strip().split('?', 1)[0]
        jobs.append([keyword, job_link] + job_page_scraper(job_link))
    result_df = pd.DataFrame(jobs, columns=['keyword', 'link', 'job_id', 'job_title', 'job_expired', 'job_confidential', 'job_salary_min', 'job_salary_max', 'job_salary_currency', 'company', 'job_post_date', 'job_internship', 'company_website', 'company_avgProcessTime', 'company_registrationNo', 'company_workingHours', 'company_facebook', 'company_size', 'company_dressCode', 'company_nearbyLocations', 'company_overview', 'job_description', 'job_summary', 'job_requirement_career_level', 'job_requirement_fieldOfStudy', 'job_requirement_yearsOfExperience', 'job_requirement_qualification', 'job_requirement_skill', 'job_employment_type', 'job_languages', 'job_benefits', 'job_apply_url', 'job_location_zipcode', 'job_location', 'job_country'])
    return result_df

def main():
    # a list of job roles to be crawled
    key_words = ['medical doctor']
    dfs = []
    for key in key_words:
        key_df = page_crawler(key)
        dfs.append(key_df)
    # save scraped information as csv
    pd.concat(dfs).to_csv("job_postings_results.csv")

if __name__ == '__main__':
    main()
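An empty CSV here usually means one of two silent failures: either soup.find_all('a', {'class': 'sx2jih0'}) matches nothing (JobStreet's obfuscated class names change often, so this selector is fragile), or job_page_scraper falls through without returning because no window.REDUX_STATE script was found on a page. A hedged rewrite of page_crawler that surfaces both cases; the '/job/' substring filter is an assumption about JobStreet's job-detail URLs, not something verified against the live site:

def page_crawler(keyword):
    # Same inputs/outputs as above, but with visibility into the two places
    # the pipeline can silently go empty.
    page_number = get_page_number(keyword)
    job_links = []
    for n in range(page_number):
        url = base_url.format(keyword, n + 1)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        links = soup.find_all('a', {'class': 'sx2jih0'})
        print('page {}: {} candidate links'.format(n + 1, len(links)))  # 0 here means the class name is stale
        job_links += links
    jobs = []
    for link in job_links:
        href = link.get('href', '')
        if not href or '/job/' not in href:   # assumption: job detail URLs contain '/job/'
            continue
        job_link = href.strip().split('?', 1)[0]
        row = job_page_scraper(job_link)
        if row is None:                       # no window.REDUX_STATE script found on that page
            print('skipped (no REDUX_STATE):', job_link)
            continue
        jobs.append([keyword, job_link] + row)
    return pd.DataFrame(jobs, columns=['keyword', 'link', 'job_id', 'job_title', 'job_expired', 'job_confidential', 'job_salary_min', 'job_salary_max', 'job_salary_currency', 'company', 'job_post_date', 'job_internship', 'company_website', 'company_avgProcessTime', 'company_registrationNo', 'company_workingHours', 'company_facebook', 'company_size', 'company_dressCode', 'company_nearbyLocations', 'company_overview', 'job_description', 'job_summary', 'job_requirement_career_level', 'job_requirement_fieldOfStudy', 'job_requirement_yearsOfExperience', 'job_requirement_qualification', 'job_requirement_skill', 'job_employment_type', 'job_languages', 'job_benefits', 'job_apply_url', 'job_location_zipcode', 'job_location', 'job_country'])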

how to dump a json dict completely? (so far it only dumps the last page)

I couldn't dump the JSON dict completely; I could only dump the last page of it. Please help me.
The code is shown below:
import json
import random
import time

import bs4
import requests

def job_list(url):
    htmlFile = requests.get(url)
    objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
    jobs = objSoup.find_all('article', class_='js-job-item')
    job_list = []
    for job in jobs:
        cust_name = job.get('data-cust-name')
        print("公司名稱:", cust_name)  # company name
        job_name = job.get('data-job-name')
        print("職稱名稱:", job_name)  # job title
        d = [('公司名稱', cust_name), ('職務名稱', job_name)]
        j_dict = dict(d)
        job_list.append(j_dict)

url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
    url = url_H + str(page + 1) + url_T
    job_list(url)
    print('-' * 70)
    time.sleep(random.randint(3, 5))

myjob = {'Job': job_list}
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
    json.dump(myjob, fnObj, indent=2, ensure_ascii=False)
Try this code:

jobs_to_dump = []  #### added

def job_list(url):
    htmlFile = requests.get(url)
    objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
    jobs = objSoup.find_all('article', class_='js-job-item')
    job_list = []
    for job in jobs:
        cust_name = job.get('data-cust-name')
        print("公司名稱:", cust_name)
        job_name = job.get('data-job-name')
        print("職稱名稱:", job_name)
        d = [('公司名稱', cust_name), ('職務名稱', job_name)]
        j_dict = dict(d)
        jobs_to_dump.append(j_dict)  ### modified

url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
page_total = 2
for page in range(page_total):
    url = url_H + str(page + 1) + url_T
    job_list(url)
    print('-' * 70)
    time.sleep(random.randint(3, 5))

myjob = {'Job': jobs_to_dump}  #### modified
fn = '104爬蟲.json'
with open(fn, "w") as fnObj:
    json.dump(myjob, fnObj, indent=2, ensure_ascii=False)
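For what it's worth, here is a slightly different sketch of the same fix that avoids the module-level list: have job_list return the per-page list and accumulate it in the caller (same URLs and field names as above):

import bs4
import json
import random
import requests
import time

def job_list(url):
    htmlFile = requests.get(url)
    objSoup = bs4.BeautifulSoup(htmlFile.text, 'lxml')
    page_jobs = []
    for job in objSoup.find_all('article', class_='js-job-item'):
        page_jobs.append({'公司名稱': job.get('data-cust-name'),   # company name
                          '職務名稱': job.get('data-job-name')})   # job title
    return page_jobs

url_H = 'https://www.104.com.tw/jobs/search/?ro=0&kwop=7&keyword=藥師&order=1&asc=0&page='
url_T = '&mode=s&jobsource=2021indexpoc'
all_jobs = []
for page in range(2):
    all_jobs += job_list(url_H + str(page + 1) + url_T)
    time.sleep(random.randint(3, 5))

with open('104爬蟲.json', 'w') as fnObj:
    json.dump({'Job': all_jobs}, fnObj, indent=2, ensure_ascii=False)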

unable to resolve "line 93, in <module> run()" and "line 89, in run: anuncios.extend(anuncios_da_pagina) TypeError: 'NoneType' object is not iterable"

from converter import gif_to_png, image_to_text
from file_helper import dictionary_list_to_csv
from util import get_site_html, get_bsobj_from

def get_anuncio(url_anuncio):
    print("Buscando " + url_anuncio)  # "Fetching <url>"
    anuncio = {"url": url_anuncio}
    html_anuncio = get_site_html(url_anuncio)
    if html_anuncio is None:
        return None
    obj_anuncio = get_bsobj_from(html_anuncio)
    if obj_anuncio is None:
        return None
    span_visible_phone = obj_anuncio.find(id="visible_phone")
    span_codigo_do_anuncio = obj_anuncio.find("span", {"class": "sc-gqjmRU"})
    codigo_do_anuncio = span_codigo_do_anuncio.get_text()
    # print("Código do anúncio: " + codigo_do_anuncio)  # ad code
    anuncio["codigo"] = codigo_do_anuncio
    phone = "Desconhecido"  # "Unknown"
    if span_visible_phone:
        imgurl = span_visible_phone.img['src']
        img = get_site_html(imgurl)
        if img is None:
            return None
        gif_name = "images/" + codigo_do_anuncio + '.gif'
        localFile = open(gif_name, 'wb')
        localFile.write(img.read())
        localFile.close()
        gif_to_png(gif_name)
        phone = image_to_text(gif_name + '.png')
    anuncio["phone"] = phone
    return anuncio

def get_anuncios(url):
    html = get_site_html(url)
    if html is None:
        return None
    bsObj = get_bsobj_from(html)
    if bsObj is None:
        return None
    try:
        links_for_anuncios = bsObj.findAll("a", {"class": "OLXad-list-link"})
    except AttributeError as e:
        print("Erro ao obter lista de anúncios")  # "Error getting the ad list"
        print(e)
        return None
    anuncios = []
    for link_anuncio in links_for_anuncios:
        anuncio = get_anuncio(link_anuncio['href'])
        anuncios.append(anuncio)
        print(anuncio)
        print(" ")
    return anuncios

def run():
    url = ""
    while url == "":
        url = input("Informe a URL desejada: ")  # "Enter the desired URL"
    numero_de_paginas = input("Informe a quantidade de páginas: ")  # "Enter the number of pages"
    if numero_de_paginas == "":
        numero_de_paginas = 1
    else:
        numero_de_paginas = int(numero_de_paginas)
    anuncios = []
    for page in range(numero_de_paginas):
        pagina_atual = page + 1
        print("Obtendo anuncios da pagina " + str(pagina_atual))  # "Getting ads from page N"
        url_formatada = url
        if pagina_atual > 1:
            url_formatada += "&o=" + str(pagina_atual)
        anuncios_da_pagina = get_anuncios(url_formatada)
        anuncios.extend(anuncios_da_pagina)
    dictionary_list_to_csv(anuncios)

run()
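The TypeError in the title comes from anuncios.extend(anuncios_da_pagina): get_anuncios deliberately returns None whenever the page cannot be fetched or parsed (or the link lookup raises AttributeError), and list.extend(None) is exactly "'NoneType' object is not iterable". A minimal guard inside the page loop of run(), keeping the same names:

        anuncios_da_pagina = get_anuncios(url_formatada)
        if anuncios_da_pagina is None:
            # page could not be fetched or parsed; skip it instead of crashing
            print("Nenhum anuncio obtido para a pagina " + str(pagina_atual))
            continue
        anuncios.extend(anuncios_da_pagina)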

How to fix scrapy exporting json rows to multiple json files

I have created a Scrapy crawler to export each individual item to a folder called out. The crawler yields 58 items, but I am not getting 58 files; only 50 files show up.
Currently, I am using Windows 10 and Python 3.
# -*- coding: utf-8 -*-
import json
import os
import random

from scrapy import Spider
from scrapy.http import Request


class AndroiddeviceSpider(Spider):
    name = 'androiddevice'
    allowed_domains = ['androiddevice.info']
    start_urls = ['']

    def __init__(self, sr_term):
        self.start_urls = ['https://www.androiddevice.info/devices?search=' + sr_term]

    def parse(self, response):
        print(response.url)
        print('\n')
        listings = response.css('a:nth-of-type(2)::attr(href)').extract()
        for link in listings:
            ac_link = response.urljoin(link)
            sum_meta = link.split('/')[-1]
            yield Request(ac_link, meta={"sum_meta": sum_meta}, callback=self.parse_p)
            # yield scrapy.Request(ac_link, callback=self.parse_p)
        # checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//@href').extract_first()
        # if checking_last:
        #     checking_last = checking_last.split('?page=')[-1].split('&')[0]
        #     ran_ = int(checking_last) + 1
        #     if int(checking_last) != 1:
        #         for i in range(2, ran_):
        #             next_p = 'https://www.androiddevice.info/devices?page={}&search=samsung'.format(i)
        #             n_link = next_p
        #             yield Request(n_link, callback=self.parse)

    def parse_p(self, response):
        sum_meta = response.meta['sum_meta']
        r = response.url
        r = r.split('/')[-2]
        sum_meta = r
        listings = response.css('th a::attr(href)').extract()
        for link in listings:
            ac_link = response.urljoin(link)
            yield Request(ac_link, callback=self.parse_details)
        checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//@href').extract_first()
        if checking_last:
            checking_last = checking_last.split('?page=')[-1].split('&')[0]
            ran_ = int(checking_last) + 1
            if int(checking_last) != 1:
                for i in range(2, ran_):
                    # next_p = 'https://www.androiddevice.info/devices?page={}&search=samsung'.format(i)
                    next_p = 'https://www.androiddevice.info/submissions/{}?page={}'.format(sum_meta, i)
                    n_link = next_p
                    yield Request(n_link, callback=self.parse_p)

    def parse_details(self, response):
        url = response.url
        print(url)
        print('\n')
        item = {}
        items = item
        timezone_olson_random = [
            "America/Indiana/Knox",
            "America/Denver",
            "America/Kentucky/Monticello",
            "America/Detroit",
            "America/Indiana/Petersburg",
            "America/New_York",
            "America/Chicago",
            "America/Kentucky/Louisville",
            "America/Los_Angeles",
            "America/Indianapolis",
        ]
        java_vm_version = response.xpath('//tr//th[contains(text(),"java_vm_version")]//following-sibling::th//pre//text()').extract_first()
        ro_product_provider = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
        ro_product_brand = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
        ro_product_name = response.xpath('//tr//th[contains(text(),"ro.product.name")]//following-sibling::th//pre//text()').extract_first()
        ro_product_model = response.xpath('//tr//th[contains(text(),"ro.product.model")]//following-sibling::th//pre//text()').extract_first()
        ro_product_board = response.xpath('//tr//th[contains(text(),"ro.product.board")]//following-sibling::th//pre//text()').extract_first()
        ro_build_id = response.xpath('//tr//th[contains(text(),"ro_build_id")]//following-sibling::th//pre//text()').extract_first()
        ro_build_version_incremental = response.xpath('//tr//th[contains(text(),"ro_build_version_incremental")]//following-sibling::th//pre//text()').extract_first()
        ro_build_version_release = response.xpath('//tr//th[contains(text(),"ro_build_version_release")]//following-sibling::th//pre//text()').extract_first()
        ro_build_version_sdk = response.xpath('//tr//th[contains(text(),"ro_build_version_sdk")]//following-sibling::th//pre//text()').extract_first()
        timezone_olson = random.choice(timezone_olson_random)
        item['java_vm_version'] = java_vm_version
        item['ro_product_provider'] = ro_product_provider
        item['ro_product_brand'] = ro_product_brand
        item['ro_product_name'] = ro_product_name
        item['ro_product_model'] = ro_product_model
        item['ro_product_board'] = ro_product_board
        item['ro_build_id'] = ro_build_id
        item['ro_build_version_incremental'] = ro_build_version_incremental
        item['ro_build_version_release'] = ro_build_version_release
        item['ro_build_version_sdk'] = ro_build_version_sdk
        item['timezone_olson'] = timezone_olson
        formatted_json = json.dumps(items, indent=4, sort_keys=True)
        with open(os.path.join('out', ro_product_model + ".json"), "w") as f:
            f.write(formatted_json)
        yield item
I expect the output to be 58 items written to 58 .json files in my out folder.
Thank you,
Palash
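A likely (though unverified) explanation for 58 items but only 50 files: the filename is built solely from ro_product_model, so whenever two devices report the same model string the second write overwrites the first file, while the yielded item count stays at 58. One hedged sketch for the file-writing lines at the end of parse_details that keeps every item by making the filename unique (the uuid suffix is my addition, not part of the original spider):

        import uuid

        # Hypothetical replacement for the file-writing lines in parse_details:
        safe_model = (ro_product_model or "unknown").replace("/", "_")
        filename = "{}_{}.json".format(safe_model, uuid.uuid4().hex[:8])
        with open(os.path.join('out', filename), "w") as f:
            f.write(json.dumps(item, indent=4, sort_keys=True))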
