Using Python 3.5, I'm looking to go to the results page of an eBay search by generating a link, save the source code as an XML document, and iterate through every individual listing, of which there could be 1,000 or more. Next I want to create a dictionary with every word that appears in every listing's title (title only) and its corresponding frequency of appearance. So for example, if I search 'honda civic' and thirty of the results are titled 'honda civic ignition switch', I'd like my results to come out as
results = {'honda':70, 'civic':60, 'ignition':30, 'switch':30, 'jdm':15, 'interior':5}
etc., etc.
Here's a link I use:
http://www.ebay.com/sch/Car-Truck-Parts-/6030/i.html?_from=R40&LH_ItemCondition=4&LH_Complete=1&LH_Sold=1&_mPrRngCbx=1&_udlo=100&_udhi=700&_nkw=honda+%281990%2C+1991%2C+1992%2C+1993%2C+1994%2C+1995%2C+1996%2C+1997%2C+1998%2C+1999%2C+2000%2C+2001%2C+2002%2C+2003%2C+2004%2C+2005%29&_sop=16
The problem I'm having is that I only get the first 50 results, instead of the thousands of results I could potentially get with different search options. What might be a better method of going about this?
And my code:
import requests
from bs4 import BeautifulSoup
from collections import Counter
r = requests.get(url)            # url is the search link shown above
myfile = 'c:/users/' + myquery   # myquery is the search term, defined earlier
fw = open(myfile + '.xml', 'w')
soup = BeautifulSoup(r.content, 'lxml')
for item in soup.find_all('ul', {'class': 'ListViewInner'}):
    fw.write(str(item))
fw.close()
print('...complete')
fr = open(myfile + '.xml', 'r')
wordfreq = Counter()
for line in fr:
    words = line.split()
    for word in words:
        wordfreq[str(word)] = wordfreq[str(word)] + 1
fw2 = open(myfile + '_2.xml', 'w')
fw2.write(str(wordfreq))
fw2.close()
You are getting only the first 50 results because eBay displays 50 results per page. The solution is to parse one page at a time. For this search, you can use a different URL:
http://www.ebay.com/sch/Car-Truck-Parts-/6030/i.html?_from=R40&LH_ItemCondition=4&LH_Complete=1&LH_Sold=1&_mPrRngCbx=1&_udlo=100&_udhi=700&_sop=16&_nkw=honda+%281990%2C+1991%2C+1992%2C+1993%2C+1994%2C+1995%2C+1996%2C+1997%2C+1998%2C+1999%2C+2000%2C+2001%2C+2002%2C+2003%2C+2004%2C+2005%29&_pgn=1&_skc=50&rt=nc
Notice the _pgn=1 parameter in the URL? This is the number of the page currently displayed. If you provide a number that exceeds the number of pages for the search, an error message will appear in an element with the class "sm-md".
So you can do something like:
import requests
from bs4 import BeautifulSoup

url_template = ("http://www.ebay.com/sch/Car-Truck-Parts-/6030/i.html?_from=R40"
                "&LH_ItemCondition=4&LH_Complete=1&LH_Sold=1&_mPrRngCbx=1"
                "&_udlo=100&_udhi=700&_sop=16"
                "&_nkw=honda+%281990%2C+1991%2C+1992%2C+1993%2C+1994%2C+1995%2C+1996%2C"
                "+1997%2C+1998%2C+1999%2C+2000%2C+2001%2C+2002%2C+2003%2C+2004%2C+2005%29"
                "&_pgn={0}&_skc=50&rt=nc")

page = 1
has_page = True
myfile = 'c:/users/' + myquery   # myquery defined as in the question
fw = open(myfile + '.xml', 'w')
while has_page:
    # rebuild the URL for the current page on every iteration
    r = requests.get(url_template.format(page))
    soup = BeautifulSoup(r.content, "lxml")
    error_msg = soup.find_all('p', {'class': "sm-md"})
    if len(error_msg) > 0:
        # past the last page: eBay shows the "sm-md" error message
        has_page = False
        continue
    for item in soup.find_all('ul', {'class': 'ListViewInner'}):
        fw.write(str(item))
    page += 1
fw.close()
I only tested fetching the pages and printing the ul elements, and it worked fine.
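To also get the word counts the question asks for, the saved listing markup can be parsed again and only the title text counted. A minimal sketch, assuming each listing title sits in an h3 element with class "lvtitle" (that class name is an assumption and may differ on the live page):

import re
from collections import Counter
from bs4 import BeautifulSoup

wordfreq = Counter()
with open(myfile + '.xml', 'r') as fr:
    soup = BeautifulSoup(fr.read(), 'lxml')
    # assumed selector: listing titles in <h3 class="lvtitle"> elements
    for title in soup.find_all('h3', {'class': 'lvtitle'}):
        words = re.findall(r'\w+', title.get_text().lower())
        wordfreq.update(words)

print(wordfreq.most_common(10))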
Related
So I have this code, but I am having issues when the data I am scraping contains commas. I want everything to go in the first column, but when there's a comma, the data after it ends up in the second column. Is it possible to scrape and write it to only the first column of the CSV without using pandas? Thanks
import requests
from bs4 import BeautifulSoup

i = 1
for url in urls:   # urls defined elsewhere
    print(f'Scraping the URL no {i}')
    i += 1
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = []
    for text in soup.find('div', class_='entry-content').find_all('div', class_='streak'):
        link = text.a['href']
        text = text.a.text
        links.append(link)
        with open("/Users/Rex/Desktop/data.csv", "a") as file_object:
            file_object.write(text)
            file_object.write("\n")
CSV files have rules for escaping commas within a single column so that they are not mistakenly interpreted as a new column. This escaping is applied automatically if you use the csv module. You really only need to open the file once, so with a few more tweaks to your code:
import csv
import requests
from bs4 import BeautifulSoup

with open("/Users/Rex/Desktop/data.csv", "a", newline=None) as file_object:
    csv_object = csv.writer(file_object)
    i = 1
    for url in urls:   # urls defined elsewhere
        print(f'Scraping the URL no {i}')
        i += 1
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for text in soup.find('div', class_='entry-content').find_all('div', class_='streak'):
            link = text.a['href']
            text = text.a.text.strip()
            # only record if we have text
            if text:
                links.append(link)
                csv_object.writerow([text])
NOTE: This code skips links that do not have text.
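For illustration, here is a minimal standalone demonstration (with hypothetical values) of how csv.writer quotes a value containing a comma so that it stays in one column:

import csv, io

buffer = io.StringIO()
writer = csv.writer(buffer)
# the comma inside the value is quoted, not treated as a column separator
writer.writerow(["Dallas, Texas"])
writer.writerow(["Boston"])
print(buffer.getvalue())
# "Dallas, Texas"
# Boston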
I'm trying to scrape results from the Yahoo search engine using a keyword like "python".
I have written this little program:
query = "python"
url = {"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb",
"https://fr.search.yahoo.com/search?p=&fr=yfp-search-sb&b=11&pz=10&pstart=5"}
def checker():
for yahoo in url:
yahooo = yahoo.replace("&fr",query + "&fr")
r = requests.get(yahooo)
soup = bs(r.text, 'html.parser')
links = soup.find_all('a')
for link in soup.find_all('a'):
a = link.get('href')
unquote(a)
print("Urls : " + a)
with open("Yahoo.txt", mode="a",encoding="utf-8") as fullz:
fullz.write(a + "\n")
fullz.close()
lines_seen = set() # holds lines already seen
outfile = open("Yahoonodup.txt", "w", encoding="utf-8")
for line in open("Yahoo.txt", "r", encoding="utf-8"):
if line not in lines_seen: # not a duplicate
outfile.write(line)
lines_seen.add(line)
outfile.close()
checker()
My output file contains some urls like this :
https://r.search.yahoo.com/cbclk2/dWU9MURCNjczQ0UwNThBNDk4MyZ1dD0xNjE2ODAzMTA5MDE4JnVvPTg0OTM3NTA2NTgyMzY5Jmx0PTImcz0xJmVzPVdHbFZxQzRHUFNfemNveGNLaUgxVkpoX3lXV2N2WFhiQkRfZklRLS0-/RV=2/RE=1616831909/RO=10/RU=https%3a%2f%2fwww.bing.com%2faclick%3fld%3de8BWTO-5A13W9y2D2Aw39AjjVUCUyb98EJf6bSa7R7dGxGXelKfNh7KW94OonXABpN7Bo9YkZqB22Evk3cfTIpJi3aGEXXKJMtDqnaNUDUVcsehzFOYyr09GoYqUE-iUywRWeOnV4aeACKf4_YX6dE2BVZAbqkvWj4HQMqeB_Fl1KlwT1v%26u%3daHR0cHMlM2ElMmYlMmZ2ZXJnbGVpY2guZm9jdXMuZGUlMmZ3YXNjaG1hc2NoaW5lJTJmJTNmY2hhbm5lbCUzZGJpbmclMjZkZXZpY2UlM2RjJTI2bmV0d29yayUzZG8lMjZjYW1wYWlnbiUzZDQwNzE4NzU1MCUyNmFkZ3JvdXAlM2QxMzU4OTk2OTA3NDAxNDE4JTI2dGFyZ2V0JTNka3dkLTg0OTM3NjAxMjIzNjUyJTNhbG9jLTcyJTI2YWQlM2Q4NDkzNzUwNjU4MjM2OSUyNmFkLWV4dGVuc2lvbiUzZA%26rlid%3d0fc40f09a4b6109e9c726f57d193ec0e/RK=2/RS=3w4U9AT_OQyaVSF.6KLwzWuo_LU-;_ylc=cnQDMQ--?IG=0ac9439bcf3f4ec087000000005bf464
And I want to change them into the real links:
https://vergleich.focus.de/waschmaschine/?channel=bing&device=c&network=o&campaign=407187550&adgroup=1358996907401418&target=kwd-84937601223652:loc-72&ad=84937506582369&ad-extension=
Is this possible?
As seen here, the response object exposes the URL of the site that ultimately returned the content. This means that for your example, you can do something like this:
url = 'https://r.search.yahoo.com/cbclk2/dWU9MURCNjczQ0UwNThBNDk4MyZ1dD0xNjE2ODAzMTA5MDE4JnVvPTg0OTM3NTA2NTgyMzY5Jmx0PTImcz0xJmVzPVdHbFZxQzRHUFNfemNveGNLaUgxVkpoX3lXV2N2WFhiQkRfZklRLS0-/RV=2/RE=1616831909/RO=10/RU=https%3a%2f%2fwww.bing.com%2faclick%3fld%3de8BWTO-5A13W9y2D2Aw39AjjVUCUyb98EJf6bSa7R7dGxGXelKfNh7KW94OonXABpN7Bo9YkZqB22Evk3cfTIpJi3aGEXXKJMtDqnaNUDUVcsehzFOYyr09GoYqUE-iUywRWeOnV4aeACKf4_YX6dE2BVZAbqkvWj4HQMqeB_Fl1KlwT1v%26u%3daHR0cHMlM2ElMmYlMmZ2ZXJnbGVpY2guZm9jdXMuZGUlMmZ3YXNjaG1hc2NoaW5lJTJmJTNmY2hhbm5lbCUzZGJpbmclMjZkZXZpY2UlM2RjJTI2bmV0d29yayUzZG8lMjZjYW1wYWlnbiUzZDQwNzE4NzU1MCUyNmFkZ3JvdXAlM2QxMzU4OTk2OTA3NDAxNDE4JTI2dGFyZ2V0JTNka3dkLTg0OTM3NjAxMjIzNjUyJTNhbG9jLTcyJTI2YWQlM2Q4NDkzNzUwNjU4MjM2OSUyNmFkLWV4dGVuc2lvbiUzZA%26rlid%3d0fc40f09a4b6109e9c726f57d193ec0e/RK=2/RS=3w4U9AT_OQyaVSF.6KLwzWuo_LU-;_ylc=cnQDMQ--?IG=0ac9439bcf3f4ec087000000005bf464'
response = requests.get(url)
# response.url is the final address after the redirects:
# 'https://vergleich.focus.de/waschmaschine/?channel=bing&device=c&network=o&campaign=407187550&adgroup=1358996907401418&target=kwd-84937601223652:loc-72&ad=84937506582369&ad-extension='
print(response.url)
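Applied to the links already collected in Yahoo.txt, a minimal sketch (the output filename Yahoo_resolved.txt is just an example, and each ad link costs one network round-trip to resolve):

import requests

with open("Yahoo.txt", "r", encoding="utf-8") as infile, \
     open("Yahoo_resolved.txt", "w", encoding="utf-8") as outfile:
    for line in infile:
        link = line.strip()
        if not link.startswith("http"):
            continue  # skip relative or empty entries
        try:
            # requests follows redirects by default; .url is the final address
            final_url = requests.get(link, timeout=10).url
            outfile.write(final_url + "\n")
        except requests.RequestException:
            outfile.write(link + "\n")  # keep the original link on failure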
I'm parsing .txt files that are basically portions of HTML. I have what I thought would work, but it's repeating the same address for all of the keys (locations). Can anyone spot a solution?
from collections import defaultdict
from bs4 import BeautifulSoup

with open('AL.txt', 'r') as f:
    contents = f.read()

soup = BeautifulSoup(contents, 'html.parser')

result = defaultdict(list)
for div in soup.find_all('div'):
    for h in soup.find_all('h2'):
        location = h.find('a').text
        for p in soup.find_all('p'):
            p = p.text.replace('\n', '|').replace('\t', '').strip()
            clean = ' '.join(p.split()).replace('| ', '|').replace(' |', '|').replace('||', '|')
            address_clean = clean.replace('| ', '|').replace(' |', '|').replace('||', '|')
            result[location] = [address_clean]

result
What I am getting right now is the same address repeated for every location.
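Without seeing the markup it is hard to be sure, but a likely cause is that each nested loop walks the whole document, so the last <p> wins for every location. A minimal sketch of pairing each <h2> with the paragraph that follows it, assuming the address <p> comes directly after its heading (an assumption about the file's structure):

from collections import defaultdict
from bs4 import BeautifulSoup

with open('AL.txt', 'r') as f:
    soup = BeautifulSoup(f.read(), 'html.parser')

result = defaultdict(list)
for h in soup.find_all('h2'):
    location = h.find('a').text
    # assumption: the address lives in the sibling <p> right after the heading
    p = h.find_next_sibling('p')
    if p is not None:
        text = p.get_text(separator='|').replace('\t', '').strip()
        address_clean = '|'.join(part.strip() for part in text.split('|') if part.strip())
        result[location].append(address_clean)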
I am a complete programming beginner, so please forgive me if I am not able to express my problem very well. I am trying to write a script that will look through a series of pages of news and record the article titles and their links. I have managed to get that done for the first page; the problem is getting the content of the subsequent pages. By searching on Stack Overflow, I think I managed to find a solution that makes the script access more than one URL, BUT it seems to overwrite the content extracted from each page it accesses, so I always end up with the same number of recorded articles in the file. Something that might help: I know that the URLs follow this model: "/ultimas/?page=1", "/ultimas/?page=2", etc., and the site appears to use AJAX to request new articles.
Here is my code:
import csv
import requests
from bs4 import BeautifulSoup as Soup
import urllib
r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="
for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))

letters = soup.find_all("div", class_="titulo-noticia")
letters[0]

lobbying = {}
for element in letters:
    lobbying[element.a.get_text()] = {}

letters[0].a["href"]

prefix = "http://agenciabrasil.ebc.com.br"
for element in letters:
    lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

import os, csv
os.chdir("...")

with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

import json
with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)

print "Fim"
Any help on how I might go about adding the content of each page to the final file would be greatly appreciated. Thank you!
How about this one, if it serves the same purpose:
import csv, requests
from lxml import html

base_url = "http://agenciabrasil.ebc.com.br"
program_url = base_url + "/ultimas/?page={0}"

outfile = open('scraped_data.csv', 'w', newline='')
writer = csv.writer(outfile)
writer.writerow(["Caption", "Link"])

for url in [program_url.format(page) for page in range(1, 4)]:
    response = requests.get(url)
    tree = html.fromstring(response.text)
    for title in tree.xpath("//div[@class='noticia']"):
        caption = title.xpath('.//span[@class="field-content"]/a/text()')[0]
        policy = title.xpath('.//span[@class="field-content"]/a/@href')[0]
        writer.writerow([caption, base_url + policy])

outfile.close()
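One detail worth keeping from this approach: the output file is opened once, before the page loop, and one row is written per article as it is found, so the rows collected from earlier pages are never overwritten.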
It looks like the code in your for loop (for page in range(1, 4):) isn't being run because your file isn't correctly indented.
If you tidy up your code, it works:
import csv, requests, os, json, urllib
from bs4 import BeautifulSoup as Soup

r = base_url = "http://agenciabrasil.ebc.com.br/"
program_url = base_url + "/ultimas/?page="

lobbying = {}   # build the dict once, so every page adds to it
for page in range(1, 4):
    url = "%s%d" % (program_url, page)
    soup = Soup(urllib.urlopen(url))
    letters = soup.find_all("div", class_="titulo-noticia")
    for element in letters:
        lobbying[element.a.get_text()] = {}
    prefix = "http://agenciabrasil.ebc.com.br"
    for element in letters:
        lobbying[element.a.get_text()]["link"] = prefix + element.a["href"]

for item in lobbying.keys():
    print item + ": " + "\n\t" + "link: " + lobbying[item]["link"] + "\n\t"

#os.chdir("...")

with open("lobbying.csv", "w") as toWrite:
    writer = csv.writer(toWrite, delimiter=",")
    writer.writerow(["name", "link",])
    for a in lobbying.keys():
        writer.writerow([a.encode("utf-8"), lobbying[a]["link"]])

with open("lobbying.json", "w") as writeJSON:
    json.dump(lobbying, writeJSON)

print "Fim"
I'd like to get some quick help on writing this webscraping program. So far it's scraping things correctly, but I'm having trouble writing it to a csv file.
I'm scraping two things from each reviewer: Review score AND written review
I'd like to write the review score into the first column and the written review into the second column. However, my writerow calls put each of them on its own row.
Appreciate any help on this! :)
import os, requests, csv
from bs4 import BeautifulSoup

# Get URL of the page
URL = ('https://www.tripadvisor.com/Attraction_Review-g294265-d2149128-Reviews-Gardens_by_the_Bay-Singapore.html')

with open('GardensbytheBay.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Looping until the 5th page of reviews
    for pagecounter in range(3):
        # Request get the first page
        res = requests.get(URL)
        res.raise_for_status
        # Download the html of the first page
        soup = BeautifulSoup(res.text, "html.parser")
        # Match it to the specific tag for all 5 ratings
        reviewElems = soup.findAll('img', {'class': ['sprite-rating_s_fill rating_s_fill s50', 'sprite-rating_s_fill rating_s_fill s40', 'sprite-rating_s_fill rating_s_fill s30', 'sprite-rating_s_fill rating_s_fill s20', 'sprite-rating_s_fill rating_s_fill s10']})
        reviewWritten = soup.findAll('p', {'class':'partial_entry'})
        if reviewElems:
            for row, rows in zip(reviewElems, reviewWritten):
                review_text = row.attrs['alt'][0]
                review2_text = rows.get_text(strip=True).encode('utf8', 'ignore').decode('latin-1')
                writer.writerow([review_text])
                writer.writerow([review2_text])
            print('Writing page', pagecounter + 1)
        else:
            print('Could not find clue.')
        # Find URL of next page and update URL
        if pagecounter == 0:
            nextLink = soup.select('a[data-offset]')[0]
        elif pagecounter != 0:
            nextLink = soup.select('a[data-offset]')[1]
        URL = 'http://www.tripadvisor.com' + nextLink.get('href')

print('Download complete')
You can put the review score and text in the same row but different columns with:
writer.writerow([review_text, review2_text])
Your initial approach takes each of the items as a separate row and writes them in succession, which is not what you want.
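In context, the inner loop of the original script would then look like this (only the writerow call changes):

for row, rows in zip(reviewElems, reviewWritten):
    review_text = row.attrs['alt'][0]
    review2_text = rows.get_text(strip=True).encode('utf8', 'ignore').decode('latin-1')
    # score in column 1, written review in column 2
    writer.writerow([review_text, review2_text])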
You can use a pandas DataFrame:
import pandas as pd

csv_file = pd.read_csv('GardensbytheBay.csv')
# idx, column_name and value are placeholders for the column position,
# its header, and the data you want to insert
csv_file.insert(idx, column_name, value)
csv_file.to_csv('output.csv', index=False)
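For illustration, a hypothetical usage of that pattern with concrete placeholder values (the inserted list must have exactly one entry per existing row):

import pandas as pd

csv_file = pd.read_csv('GardensbytheBay.csv')
# placeholder data: one written review per row already in the file
written_reviews = ['Great gardens', 'Worth the visit']
csv_file.insert(1, 'written_review', written_reviews)   # position 1, hypothetical header
csv_file.to_csv('output.csv', index=False)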