I have written the following code to crawl tweets with 'utf-8' encoding:
kws=[]
f=codecs.open("keywords", encoding='utf-8')
kws = f.readlines()
f.close()
print kws
for kw in kws:
timeline_endpoint ='https://api.twitter.com/1.1/search/tweets.json?q='+kw+'&count=100&lang=fr'
print timeline_endpoint
response, data = client.request(timeline_endpoint)
tweets = json.loads(data)
for tweet in tweets['statuses']:
my_on_data(json.dumps(tweet.encode('utf-8')))
time.sleep(3)
but I am getting the following error:
response, data = client.request(timeline_endpoint)
File "build/bdist.linux-x86_64/egg/oauth2/__init__.py", line 676, in request
File "build/bdist.linux-x86_64/egg/oauth2/__init__.py", line 440, in to_url
File "/usr/lib/python2.7/urllib.py", line 1357, in urlencode
l.append(k + '=' + quote_plus(str(elt)))
UnicodeEncodeError: 'ascii' codec can't encode character u'\xe9' in position 1: ordinal not in range(128)
I would appreciate any help.
Okay, here is the solution using a different search approach:
auth = tweepy.OAuthHandler("k1", "k2")
auth.set_access_token("k3", "k4")
api = tweepy.API(auth)
for kw in kws:
max_tweets = 10
searched_tweets = [status for status in tweepy.Cursor(api.search, q=kw.encode('utf-8')).items(max_tweets)]
for tweet in searched_tweets:
my_on_data(json.dumps(tweet._json))
time.sleep(3)
Related
I have tried to scrape a website using Python and Selenium.
Here is the partial code:
def data_html_text(self): #Downloads page source code
Xyz_page_source = self.driver.page_source
with open(self.Html_source, 'w', encoding="utf-8") as file:
file.write(Xyz_page_source)
def email_parser(self): # gets scraped links and filters it
count = 0
file = open(self.Html_source)
data = file.read()
soup = BeautifulSoup(data, 'lxml')
all_divs = soup.find_all('li',class_='badgeList__item',)
scrapper_links = [self.Base_url + a_href.div.div.a['href'] for a_href in all_divs]
for link in scrapper_links:
count += 1
print("{} ------> {}".format(count,link))
count = 0
data = []
for s_link in scrapper_links:
user_page = requests.get(s_link, headers=self.headers)
text = user_page.content
inner_pagee = text.decode()
all_emails = re.findall(r'[w\w.-]+#[\w\.-]+', inner_pagee)
if all_emails:
count += 1
print("{} Scraping Emails: {}".format(count, all_emails[0]))
data.append(all_emails[0])
new_data = list(set(data))
data1 =[]
for x in new_data:
x = re.sub('[.]$','',x)
data1.append(x)
print(data1)
with open('test.csv', "w", encoding="utf-8") as output:
writer = csv.writer(output, lineterminator='\n')
for val in data1:
writer.writerow([val])
But I kept getting the following Error:
UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position
229393: character maps to
Any idea on how to solve this?
File you are opening is not utf-8 format, please check the format (encoding) and use that instead of utf-8.
Try
encoding='utf-8-sig'
I was trying to get data from the website and input it into csv
table_array = []
def get_data(page):
url = "https://www.sl886.com/stockrating/list?list=stockrating&page=" + str(page)
get = requests.get(url, headers=headers)
table_bs4 = BeautifulSoup(get.text, "html.parser")
main_table = table_bs4.find("table").find("tbody")
for i in main_table.find_all("tr"):
array = []
for x in i.find_all("td"):
array.append(x.text)
print(x.text, end = " ")
table_array.append(array[:-1])
print("")
def get_page_number():
url = "https://www.sl886.com/stockrating/list?list=stockrating&page=1"
get = requests.get(url, headers=headers)
table_bs4 = BeautifulSoup(get.text, "html.parser")
sum_of_items = table_bs4.find("div", {"class": "summary"})
sum_of_items = int(sum_of_items.find_all("b")[1].text.replace(",", ""))
total_pages = math.ceil(sum_of_items/20)
return total_pages
Run:
for i in range(1):#range(get_page_number()):
get_data(i + 9)
with open("big_bank_rating.csv", "w", newline="") as csvfile:
write = csv.writer(csvfile)
write.writerow(["日期", "大行", "股票", "最新評級", "目標價", "變化", "潛在升幅"])
for xi in table_array:
print(xi)
write.writerows(table_array)
print("!!!Done Append!!!")
but it comes to an error of encoding
Traceback (most recent call last):
File "data_crawl.py", line 55, in <module>
write.writerows(table_array)
UnicodeEncodeError: 'cp950' codec can't encode character '\u7dab' in position 19: illegal multibyte sequence
I tried to use encoding "uft-8" but the result I got was garbled. How can I fix it? to encode the character '\u7dab'
I have to use
encoding="utf_8_sig"
I was working on a project that requires data from instagram, i am collecting it using the json.
When i use the test link: http://vocab.nic.in/rest.php/states/json - it works, but when i use: https://www.instagram.com/instagram/?__a=1 (also json) - it didn't work.
UnicodeEncodeError: 'charmap' codec can't encode characters in position 196-197: character maps to
import urllib.request
import json
def getResponse(url):
operUrl = urllib.request.urlopen(url)
if(operUrl.getcode()==200):
data = operUrl.read()
jsonData = json.loads(data)
else:
print("Error receiving data", operUrl.getcode())
return jsonData
def main():
urlData = "https://www.instagram.com/instagram/?__a=1"
jsonData = getResponse(urlData)
# print the state id and state name corresponding
print(jsonData)
if __name__ == '__main__':
main()
Full Traceback:
Traceback (most recent call last):
File "C:\Users\andrey_user\Desktop\ig\test.py", line 21, in [module] main()
File "C:\Users\andrey_user\Desktop\ig\test.py", line 18, in main print(jsonData)
File "D:\programs\programing\Python\lib\encodings\cp1251.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
UnicodeEncodeError: 'charmap' codec can't encode characters in position 196-197: character maps to [undefined]
Here is my code, I am scraping the data of 99acres website and storing it in a csv file but when i am doing so it gives me a error message of 'charmap' codec can't encode character '\u20b9'...Please Tell me how it will be resolved...
import io
import csv
import requests
from bs4 import BeautifulSoup
response = requests.get('https://www.99acres.com/search/property/buy/residential-all/hyderabad?search_type=QS&search_location=HP&lstAcn=HP_R&lstAcnId=0&src=CLUSTER&preference=S&selected_tab=1&city=269&res_com=R&property_type=R&isvoicesearch=N&keyword_suggest=hyderabad%3B&fullSelectedSuggestions=hyderabad&strEntityMap=W3sidHlwZSI6ImNpdHkifSx7IjEiOlsiaHlkZXJhYmFkIiwiQ0lUWV8yNjksIFBSRUZFUkVOQ0VfUywgUkVTQ09NX1IiXX1d&texttypedtillsuggestion=hyder&refine_results=Y&Refine_Localities=Refine%20Localities&action=%2Fdo%2Fquicksearch%2Fsearch&suggestion=CITY_269%2C%20PREFERENCE_S%2C%20RESCOM_R&searchform=1&price_min=null&price_max=null')
html = response.text
soup = BeautifulSoup(html, 'html.parser')
list=[]
dealer = soup.findAll('div',{'class': 'srpWrap'})
for item in dealer:
try:
p = item.contents[1].find_all("div",{"class":"_srpttl srpttl fwn wdthFix480 lf"})[0].text
except:
p=''
try:
d = item.contents[1].find_all("div",{"class":"lf f13 hm10 mb5"})[0].text
except:
d=''
li=[p,d]
list.append(li)
with io.open('project.csv','w',encoding="utf-8") as file:
writer= csv.writer(file)
for row in list:
writer.writerows(row)
file.close()
Use: .encode(encoding = 'UTF-8')
I try to analyse some tweets I got from tweeter, but It seems I have a probleme of encoding, if you have any idea..
import json
#Next we will read the data in into an array that we call tweets.
tweets_data_path = 'C:/Python34/TESTS/twitter_data.txt'
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
try:
tweet = json.loads(line)
tweets_data.append(tweet)
except:
continue
print(len(tweets_data))#412 tweets
print(tweet)
I got the mistake :
File "C:\Python34\lib\encodings\cp850.py", line 19, in encode return codecs.charmap_encode(input,self.errors,encoding_map)[0]
unicodeEncodeError: 'charpmap' codec can't encode character '\u2026' in position 1345: character maps to undefined
At work, I didn't get the error, but I have python 3.3, does it make a difference, do you think ?
-----EDIT
The comment from #MarkRamson answered my question
You should specify the encoding when opening the file:
tweets_file = open(tweets_data_path, "r", encoding="utf-8-sig")