python issue to join relative url to absolute url for img - python

I'm facing the following issues with my current code to make it work. I just concatenate the URL but its not working:
Current relative path (this is what I get with normal response.xpath crawl):
This is my current code:
class MercadoSpider(CrawlSpider):
name = 'extractor'
item_count = 0
rules = {
# Para cada item
Rule(LinkExtractor(allow = (), restrict_xpaths = ('//*[#id="main-container"]/div/div[2]/div[1]/ul/li[7]/a'))),
Rule(LinkExtractor(allow =(), restrict_xpaths = ('//*[#id="main-container"]/div/div[2]/div[2]/div/div/div/h4/a')),
callback = 'parse_item', follow = False)
def parse_item(self, response):
ml_item = MercadoItem()
ml_item['titulo'] = response.xpath('normalize-space(//*[#id="main-container"]/div/div[2]/div[1]/div[2]/h2)').extract_first()
ml_item['sku'] = response.xpath('normalize-space(//*[#id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[2]/a)').extract()
ml_item['marca'] = response.xpath('normalize-space(//*[#id="main-container"]/div/div[2]/div[1]/div[2]/ul/li[1]/a)').extract()
ml_item['tecnologia'] = response.xpath('normalize-space(//*[#id="DetailedSpecs"]/table/tbody/tr[4]/td)').extract_first()
ml_item['tipo'] = response.xpath('normalize-space(//*[#id="DetailedSpecs"]/table/tbody/tr[3]/td)').extract()
ml_item['precio'] = response.xpath('normalize-space(//*[#id="main-container"]/div/div[2]/div[1]/div[2]/div[1]/span[2])').extract()
ml_item['color'] = response.xpath('normalize-space(//*[#id="mainC"]/div/div/div/div/ul/li/b)').extract()
ml_item['potencia'] = response.xpath('normalize-space(//*[#id="ProductReview"]/div/div/div/dl/dd/strong)').extract()
ml_item['condicion'] = response.xpath('normalize-space(//*[#class="stock in-stock"])').extract_first()
ml_item['desc_corta'] = response.xpath('normalize-space(//*[#id="tab-additional_information"])').extract()
ml_item['descripcion'] = response.xpath('normalize-space(//*[#id="main-container"]/div/div[2]/div[2]/div)').extract()
ml_item['id_publicacion'] = response.xpath('normalize-space(//*[#id="mainC"]/div/div/div[11]/div[1]/ul/li[1]/b)').extract()
#imagenes del producto
xpath1 = ''
xpath2 = response.xpath('//*[#id="main-container"]/div/div[2]/div[1]/div[1]/p/img/#src').extract_first()
ml_item['image_urls'] = xpath1 + xpath2
ml_item['image_name'] = response.xpath('//*[#id="main-container"]/div/div[2]/div[1]/div[1]/p/img/#src').extract()
#info de la tienda o vendedor
ml_item['categoria'] = response.xpath('normalize-space(//*[#class="woocommerce-breadcrumb breadcrumbs"])').extract_first()
self.item_count += 1
if self.item_count > 10000:
raise CloseSpider('item_exceeded')
yield ml_item

absolute_url = response.urljoin(your_url_from_xpath)
scrapy documentation


Web Scrapper for Job Search Platform not Scrapping Content

I am trying to scrape data from a job searching platform called Job Street. The web crawler works but the generated csv file is empty with no data. The expected output should be a list of jobs with the job title, description, etc.
Below is my code. I am performing this task using selenium. I would really appreciate the help. Thank you in advanced.
headers = {'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) (KHTML, like Gecko) Chrome/102.0.5005.61 '}
path = "/Users/Downloads/jobstreet-scraper-main/chromedriver"
driver = Chrome(executable_path=path)
base_url = "{}-jobs/{}/"
def get_page_number(keyword):
#input: keyword for job_postings
#output: number of pages
url = base_url.format(keyword, 1)
soup = BeautifulSoup(driver.page_source, 'html.parser')
result_text = soup.find("span",{"class": "sx2jih0 zcydq84u _18qlyvc0 _18qlyvc1x _18qlyvc1 _1d0g9qk4 _18qlyvc8"})
results = result_text.text.split()
result = result_text.text.split()[-2]
result1 = result.replace(',','')
result2 = int(result1)
page_number = math.ceil(result2/30)
return page_number
def job_page_scraper(link):
url = ""+link
print("scraping...", url)
soup = BeautifulSoup(driver.page_source, 'html.parser')
scripts = soup.find_all("script")
for script in scripts:
if script.contents:
txt = script.contents[0].strip()
if 'window.REDUX_STATE = ' in txt:
jsonStr = script.contents[0].strip()
jsonStr = jsonStr.split('window.REDUX_STATE = ')[1].strip()
jsonStr = jsonStr.split('}}}};')[0].strip()
jsonStr = jsonStr+"}}}}"
jsonObj = json.loads(jsonStr)
job = jsonObj['details']
job_id = job['id']
job_expired = job['isExpired']
job_confidential = job['isConfidential']
job_salary_min = job['header']['salary']['min']
job_salary_max = job['header']['salary']['max']
job_salary_currency = job['header']['salary']['currency']
job_title = job['header']['jobTitle']
company = job['header']['company']['name']
job_post_date = job['header']['postedDate']
job_internship = job['header']['isInternship']
company_website = job['companyDetail']['companyWebsite']
company_avgProcessTime = job['companyDetail']['companySnapshot']['avgProcessTime']
company_registrationNo = job['companyDetail']['companySnapshot']['registrationNo']
company_workingHours = job['companyDetail']['companySnapshot']['workingHours']
company_facebook = job['companyDetail']['companySnapshot']['facebook']
company_size = job['companyDetail']['companySnapshot']['size']
company_dressCode = job['companyDetail']['companySnapshot']['dressCode']
company_nearbyLocations = job['companyDetail']['companySnapshot']['nearbyLocations']
company_overview = job['companyDetail']['companyOverview']['html']
job_description = job['jobDetail']['jobDescription']['html']
job_summary = job['jobDetail']['summary']
job_requirement_career_level = job['jobDetail']['jobRequirement']['careerLevel']
job_requirement_yearsOfExperience = job['jobDetail']['jobRequirement']['yearsOfExperience']
job_requirement_qualification = job['jobDetail']['jobRequirement']['qualification']
job_requirement_fieldOfStudy = job['jobDetail']['jobRequirement']['fieldOfStudy']
#job_requirement_industry = job['jobDetail']['jobRequirement']['industryValue']['label']
job_requirement_skill = job['jobDetail']['jobRequirement']['skills']
job_employment_type = job['jobDetail']['jobRequirement']['employmentType']
job_languages = job['jobDetail']['jobRequirement']['languages']
job_benefits = job['jobDetail']['jobRequirement']['benefits']
job_apply_url = job['applyUrl']['url']
job_location_zipcode = job['location'][0]['locationId']
job_location = job['location'][0]['location']
job_country = job['sourceCountry']
return [job_id, job_title, job_expired, job_confidential, job_salary_max, job_salary_max, job_salary_currency, company, job_post_date, job_internship, company_website, company_avgProcessTime, company_registrationNo, company_workingHours, company_facebook, company_size, company_dressCode, company_nearbyLocations, company_overview, job_description, job_summary, job_requirement_career_level, job_requirement_fieldOfStudy, job_requirement_yearsOfExperience, job_requirement_qualification, job_requirement_skill, job_employment_type, job_languages, job_benefits, job_apply_url, job_location_zipcode, job_location, job_country]
def page_crawler(keyword):
# input: keyword for job postings
# output: dataframe of links scraped from each page
# page number
page_number = get_page_number(keyword)
job_links = []
for n in range(page_number):
print('Loading page {} ...'.format(n+1))
url = base_url.format(keyword, n+1)
soup = BeautifulSoup(driver.page_source, 'html.parser')
#extract all job links
links = soup.find_all('a',{'class':'sx2jih0'})
job_links += links
jobs = []
for link in job_links:
job_link = link['href'].strip().split('?', 1)[0]
jobs.append([keyword, job_link] + job_page_scraper(job_link))
result_df = pd.DataFrame(jobs, columns = ['keyword', 'link', 'job_id', 'job_title', 'job_expired', 'job_confidential', 'job_salary_max', 'job_salary_max', 'job_salary_currency', 'company', 'job_post_date', 'job_internship', 'company_website', 'company_avgProcessTime', 'company_registrationNo', 'company_workingHours', 'company_facebook', 'company_size', 'company_dressCode', 'company_nearbyLocations', 'company_overview', 'job_description', 'job_summary', 'job_requirement_career_level', 'job_requirement_fieldOfStudy', 'job_requirement_yearsOfExperience', 'job_requirement_qualification', 'job_requirement_skill', 'job_employment_type', 'job_languages', 'job_benefits', 'job_apply_url', 'job_location_zipcode', 'job_location', 'job_country'])
return result_df
def main():
# a list of job roles to be crawled
key_words = ['medical doctor']
dfs = []
for key in key_words:
key_df = page_crawler(key)
# save scraped information as csv
if __name__ == '__main__':

unable to resolve " line 93, in <module> run() and line 89, in run anuncios.extend(anuncios_da_pagina) TypeError: 'NoneType' object not iterable"

from converter import gif_to_png, image_to_text
from file_helper import dictionary_list_to_csv
from util import get_site_html, get_bsobj_from
def get_anuncio(url_anuncio):
print("Buscando " + url_anuncio)
anuncio = {"url": url_anuncio}
html_anuncio = get_site_html(url_anuncio)
if html_anuncio is None:
return None
obj_anuncio = get_bsobj_from(html_anuncio)
if obj_anuncio is None:
return None
span_visible_phone = obj_anuncio.find(id="visible_phone")
span_codigo_do_anuncio = obj_anuncio.find("span", {"class": "sc-gqjmRU"})
codigo_do_anuncio = span_codigo_do_anuncio.get_text()
#print("Código do anúncio: " + codigo_do_anuncio)
anuncio["codigo"] = codigo_do_anuncio
phone = "Desconhecido"
if (span_visible_phone):
imgurl = span_visible_phone.img['src']
img = get_site_html(imgurl)
if img is None:
return None
gif_name = "images/" + codigo_do_anuncio + '.gif'
localFile = open(gif_name, 'wb')
phone = image_to_text(gif_name + '.png')
anuncio["phone"] = phone
return anuncio
def get_anuncios(url):
html = get_site_html(url)
if html is None:
return None
bsObj = get_bsobj_from(html)
if bsObj is None:
return None
links_for_anuncios = bsObj.findAll("a", {"class": "OLXad-list-link"})
except AttributeError as e:
print("Erro ao obter lista de anúncios")
return None
anuncios = []
for link_anuncio in links_for_anuncios:
anuncio = get_anuncio(link_anuncio['href'])
print(" ")
return anuncios
def run():
url = ""
while url == "":
url = input("Informe a URL desejada: ")
numero_de_paginas = input("Informe a quantidade de páginas: ")
if numero_de_paginas == "":
numero_de_paginas = 1
numero_de_paginas = int(numero_de_paginas)
anuncios = []
for page in range(numero_de_paginas):
pagina_atual = page + 1
print("Obtendo anuncios da pagina " + str(pagina_atual))
url_formatada = url
if pagina_atual > 1:
url_formatada += "&o=" + str(pagina_atual)
anuncios_da_pagina = get_anuncios(url_formatada)

Python Scrapy Spider Not Following Correct Link

I am trying to scrape the data off of this post. I am having an issue with scraping the comments however. The pagination of the comments is determined by the "page=1" at the end of the url. I noticed that if "page=0" is used it loads all the comments on one page which is really nice. However, my scrapy script will only scrape the comments from the first page, no matter what. Even if I change the link to "page=2" it still will only scrape the comments from the first page. I can not figure out why this issue is occurring.
import scrapy
from scrapy.crawler import CrawlerProcess
class IdeaSpider(scrapy.Spider):
name = "IdeaSpider"
def start_requests(self):
yield scrapy.Request(
"-the-bottom-of-the-queue?page=0", callback=self.parse_idea)
# parses title, post, status, author, date
def parse_idea(self, response):
post_author = response.xpath('//span[#class = "username-content"]/text()')
post_categories = response.xpath('//a[#class = "list-tags-item ng-star-inserted"]/text()')
post_categories_ext = post_categories.extract()
if len(post_categories_ext) > 1:
post_categories_combined = ""
for category in post_categories_ext:
post_categories_combined = post_categories_combined + category + ", "
post_date = response.xpath('//div[#class = "time-date"]/text()')
post_title = response.xpath('//h1[#class = "title"]/text()')
post_body = response.xpath('//article[#class = "post-list-item clearfix ng-star-inserted"]//div[#class = '
'"post-list-item-message-content post-content ng-star-inserted"]//text()')
post_body_ext = post_body.extract()
if len(post_body_ext) > 1:
post_body_combined = ""
for text in post_body_ext:
post_body_combined = post_body_combined + " " + text
post_status = response.xpath('//p[#class = "status-title"][1]/text()')
if len(post_status.extract()) != 0:
temp_list.append("no status")
dev_name = response.xpath('//div[#class = "ideas-details-status-comment user-role u-bdcolor-2 dev"]//p[#class '
'= "username user-role-username"]/text()')
dev_comment = response.xpath('//div[#class = "message post-content ng-star-inserted"]/p/text()')
c_author_index = 0
c_body_index = 0
c_author_path = response.xpath('//article[#class = "post-list-item clearfix two-columns '
'ng-star-inserted"]//span[#class = "username-content"]/text()')
while c_author_index < len(c_author_path):
comment_author = c_author_path[c_author_index]
c_author_index += 1
c_body_combined = ""
c_body_path = '//div[#class = "post-list-comments"]/g2g-comments-item[1]/article[#class = ' \
'"post-list-item clearfix two-columns ng-star-inserted"]/div/div//div[#class ' \
'="post-list-item-message-content post-content ng-star-inserted"]//text() '
c_body = response.xpath(c_body_path.replace("1", str(c_body_index + 1)))
c_body_list = c_body.extract()
if len(c_body_list) > 1:
for word in c_body_list:
c_body_combined = c_body_combined + " " + word
c_body_index += 1
elif len(c_body_list) != 0:
c_body_index += 1
elif len(c_body_list) == 0:
c_body_index += 1
c_body = response.xpath(c_body_path.replace("1", str(c_body_index + 1)))
c_body_list = c_body.extract()
if len(c_body_list) > 1:
for word in c_body_list:
c_body_combined = c_body_combined + " " + word
c_body_index += 1
temp_list = list()
all_post_data = list()
process = CrawlerProcess()
This is because the comment pages are loaded using JavaScript and Scrapy is not rendering JavaScript. You could use Splash.

Python requests.get() loop returns nothing

When trying to scrape multiple pages of this website, I get no content in return. I usually check to make sure all the lists I'm creating are of equal length, but all are coming back as len = 0.
I've used similar code to scrape other websites, so why does this code not work correctly?
Some solutions I've tried, but haven't worked for my purposes: requests.Session() solutions as suggested in this answer, .json as suggested here.
for page in range(100, 350):
page = requests.get("" + str(page) + "&res=pm")
page.encoding = page.apparent_encoding
if not page:
soup = BeautifulSoup(page.text, 'html.parser')
ghana_tbody = soup.find_all('tbody')
for container in ghana_tbody:
#### CANDIDATES ####
candidate = container.find_all('div', class_='can par')
for data in candidate:
cand = data.find('h4')
for info in cand:
if cand is not None:
can2 = info.get_text()
#### PARTY NAMES ####
partyn = container.find_all('h5')
for data in partyn:
if partyn is not None:
partyn2 = data.get_text()
votec = container.find_all('td', class_='votes')
for data in votec:
if votec is not None:
votec2 = data.get_text()
cansh = container.find_all('td', class_='percent')
for data in cansh:
if cansh is not None:
cansh2 = data.get_text()
#### TOTAL VOTES ####`
tfoot = soup.find_all('tr', class_='total')
for footer in tfoot:
fvote = footer.find_all('td', class_='votes')
for data in fvote:
if fvote is not None:
fvote2 = data.get_text()
fvoteindiv = [fvote2]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
Thanks in advance for your help!
I've made some simplification changes. The major changes that needed to be changed were:
ghana_tbody = soup.find_all('table', class_='canResults')
can2 = info # not info.get_text()
I have only tested this against page 112; life is too short.
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep
can = []
pty_n = []
cv1 = []
cvs1 = []
vot1 = []
END_PAGE = 112
for page in range(START_PAGE, END_PAGE + 1):
page = requests.get("")
page.encoding = page.apparent_encoding
if not page:
soup = BeautifulSoup(page.text, 'html.parser')
ghana_tbody = soup.find_all('table', class_='canResults')
for container in ghana_tbody:
#### CANDIDATES ####
candidate = container.find_all('div', class_='can par')
for data in candidate:
cand = data.find('h4')
for info in cand:
can2 = info # not info.get_text()
#### PARTY NAMES ####
partyn = container.find_all('h5')
for data in partyn:
partyn2 = data.get_text()
votec = container.find_all('td', class_='votes')
for data in votec:
votec2 = data.get_text()
cansh = container.find_all('td', class_='percent')
for data in cansh:
cansh2 = data.get_text()
#### TOTAL VOTES ####`
tfoot = soup.find_all('tr', class_='total')
for footer in tfoot:
fvote = footer.find_all('td', class_='votes')
for data in fvote:
fvote2 = data.get_text()
fvoteindiv = [fvote2]
fvotelist = fvoteindiv * (len(pty_n) - len(vot1))
print('can = ', can)
print('pty_n = ', pty_n)
print('cv1 = ', cv1)
print('cvs1 = ', cvs1)
print('vot1 = ', vot1)
can = ['Kwadwo Baah Agyemang', 'Daniel Osei', 'Anyang - Kusi Samuel', 'Mary Awusi']
pty_n = ['NPP', 'NDC', 'IND', 'IND']
cv1 = ['14,966', '9,709', '8,648', '969', '34292']
cvs1 = ['43.64', '28.31', '25.22', '2.83', '\xa0']
vot1 = ['34292', '34292', '34292', '34292']
Be sure to first change START_PAGE and END_PAGE to 100 and 350 respecively.

How to fixing scrapy json row to multiple json file

I have created a scrapy crawler to export individual item to a folder called out but I got 58 items from crawler but not getting 58 files. We just found 50 files.
Currently, I am using windows 10 and python 3
# -*- coding: utf-8 -*-
import json
import os
import random
from scrapy import Spider
from scrapy.http import Request
class AndroiddeviceSpider(Spider):
name = 'androiddevice'
allowed_domains = ['']
start_urls = ['']
def __init__(self,sr_term):
def parse(self, response):
print (response.url)
print ('\n')
listings = response.css('a:nth-of-type(2)::attr(href)').extract()
for link in listings:
ac_link = response.urljoin(link)
sum_meta = link.split('/')[-1]
yield Request(ac_link, meta={"sum_meta":sum_meta}, callback=self.parse_p)
# yield scrapy.Request(ac_link, callback=self.parse_p)
# checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//#href').extract_first()
# if checking_last:
# checking_last = checking_last.split('?page=')[-1].split('&')[0]
# ran_ = int(checking_last)+1
# if int(checking_last) is not 1:
# for i in range(2, ran_):
# next_p = '{}&search=samsung'.format(i)
# n_link = next_p
# yield Request(n_link, callback=self.parse)
def parse_p(self, response):
sum_meta = response.meta['sum_meta']
r = response.url
r = r.split('/')[-2]
sum_meta = r
listings = response.css('th a::attr(href)').extract()
for link in listings:
ac_link = response.urljoin(link)
yield Request(ac_link, callback=self.parse_details)
checking_last = response.xpath('//*[contains(text(),"Last")]').xpath('.//#href').extract_first()
if checking_last:
checking_last = checking_last.split('?page=')[-1].split('&')[0]
ran_ = int(checking_last)+1
if int(checking_last) is not 1:
for i in range(2, ran_):
# next_p = '{}&search=samsung'.format(i)
next_p = '{}'+'?page={}'.format(sum_meta,i)
n_link = next_p
yield Request(n_link, callback=self.parse_p)
def parse_details(self, response):
url = response.url
print (url)
print ('\n')
item = {}
items = item
timezone_olson_random = [
java_vm_version = response.xpath('//tr//th[contains(text(),"java_vm_version")]//following-sibling::th//pre//text()').extract_first()
ro_product_provider = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
ro_product_brand = response.xpath('//tr//th[contains(text(),"ro.product.manufacturer")]//following-sibling::th//pre//text()').extract_first()
ro_product_name = response.xpath('//tr//th[contains(text(),"")]//following-sibling::th//pre//text()').extract_first()
ro_product_model = response.xpath('//tr//th[contains(text(),"ro.product.model")]//following-sibling::th//pre//text()').extract_first()
ro_product_board = response.xpath('//tr//th[contains(text(),"ro.product.board")]//following-sibling::th//pre//text()').extract_first()
ro_build_id = response.xpath('//tr//th[contains(text(),"ro_build_id")]//following-sibling::th//pre//text()').extract_first()
ro_build_version_incremental = response.xpath('//tr//th[contains(text(),"ro_build_version_incremental")]//following-sibling::th//pre//text()').extract_first()
ro_build_version_release = response.xpath('//tr//th[contains(text(),"ro_build_version_release")]//following-sibling::th//pre//text()').extract_first()
ro_build_version_sdk = response.xpath('//tr//th[contains(text(),"ro_build_version_sdk")]//following-sibling::th//pre//text()').extract_first()
timezone_olson = random.choice(timezone_olson_random)
item['java_vm_version'] = java_vm_version
item['ro_product_provider'] = ro_product_provider
item['ro_product_brand'] = ro_product_brand
item['ro_product_name'] = ro_product_name
item['ro_product_model'] = ro_product_model
item['ro_product_board'] = ro_product_board
item['ro_build_id'] = ro_build_id
item['ro_build_version_incremental'] = ro_build_version_incremental
item['ro_build_version_release'] = ro_build_version_release
item['ro_build_version_sdk'] = ro_build_version_sdk
item['timezone_olson'] = timezone_olson
formatted_json = json.dumps(items, indent = 4,sort_keys=True)
with open(os.path.join('out', ro_product_model+".json"), "w") as f:
yield item
I expect the output files would be 58 items to 58 .json files into my out folder
Thank you,

