Question

我通过Twitter的API提取推文数据，其中一条推文包含一个特殊字符（右单引号 ’，即U+2019），结果我不断收到错误，提示Python无法对该字符进行编码（charmap字符映射错误）。我搜遍了互联网，仍然没有找到解决办法。我只想把该字符替换为Python能识别的撇号，或者替换为空字符串（也就是直接删除它）。我使用的是Python 3.3。对于如何解决这个问题，有什么建议吗？这看起来很简单，但我是Python新手。

编辑：这是我用来尝试过滤掉抛出错误的unicode字符的函数。

@staticmethod
def UnicodeFilter(var):
    """Swap one character in ``var`` for an apostrophe, then pass the text to Functions.ToSQL."""
    temp = var
    # NOTE: chr() takes the code point as an integer. The literal 2019 is
    # decimal (i.e. U+07E3), whereas the right single quotation mark is
    # U+2019 (decimal 8217), so this call does not target that character.
    temp = temp.replace(chr(2019), "'")
    temp = Functions.ToSQL(temp)
    return temp

此外，运行程序时，我的错误如下。

'charmap' codec can't encode character '\u2019' in position 59: character maps to <undefined>

编辑：以下是我的源代码示例：

import json
import mysql.connector
import unicodedata
from MySQLCL import MySQLCL

class Functions(object):
    """Static helpers for cleaning tweet fields before they are stored in SQL."""

    @staticmethod
    def Clean(string):
        """Return ``str(string)`` without quotes, parentheses or commas.

        Every ``'``, ``(``, ``)`` and ``,`` is removed, then leading and
        trailing whitespace is stripped.
        """
        # One translate() pass replaces the four chained replace() calls.
        return str(string).translate(str.maketrans("", "", "'(),")).strip()

    @staticmethod
    def ParseTweet(string):
        """Print the SQL-safe text of every tweet in ``string``.

        Args:
            string: A sequence of decoded tweet objects (dicts). Only the
                ``"text"`` entry of each tweet is read.

        Raises:
            KeyError: If a tweet has no ``"text"`` entry.
        """
        # Iterate the tweets directly rather than indexing via range(len()).
        # Only "text" is read: the other fields were copied into locals that
        # nothing used, which made every optional key mandatory.
        for tweet in string:
            print(Functions.UnicodeFilter(tweet["text"]))
        # NOTE(review): when the INSERT into the `tweet` table is added, pass
        # the values as query parameters (e.g. cursor.execute(sql, params))
        # instead of concatenating them into the statement; the driver then
        # handles quoting. Confirm what the MySQLCL wrapper supports.

    @staticmethod
    def ToBool(variable):
        """Convert the text ``"true"`` / ``"false"`` (any letter case) to a bool.

        Returns:
            ``True`` or ``False`` for the two recognised words, ``None`` for
            any other text.
        """
        lowered = variable.lower()
        if lowered == "true":
            return True
        if lowered == "false":
            return False
        # Unrecognised text: same result as before, now stated explicitly.
        return None

    @staticmethod
    def CheckNull(var):
        """Return an empty string for ``None`` and any other value unchanged."""
        # Identity test: `== None` can be fooled by a custom __eq__.
        return "" if var is None else var

    @staticmethod
    def ToSQL(var):
        """Return ``var`` as text with every single quote doubled.

        Doubling ``'`` is the SQL escape for a quote inside a string literal.
        Non-string input is converted with ``str()`` first.
        """
        return str(var).replace("'", "''")

    @staticmethod
    def UnicodeFilter(var):
        """Replace right single quotation marks with ``'`` and escape for SQL.

        Args:
            var: The text to filter (a ``str``; Python 3 strings are already
                Unicode, so no decoding step is needed).

        Returns:
            The text with every U+2019 replaced by an ASCII apostrophe and
            then escaped by ``ToSQL``.
        """
        # U+2019 is hexadecimal 0x2019 (decimal 8217); chr(2019) would be a
        # different character. The Python 2-only unicode() call is dropped.
        plain = var.replace("\u2019", "'")
        return Functions.ToSQL(plain)

ekhumoro's回复是正确的。

Answer 1

您的程序似乎存在两个问题。

首先，您向chr()传入了错误的代码点。字符 ’ 的十六进制代码点是0x2019，但您传入的是十进制数2019（相当于十六进制的0x7e3）。因此您需要使用以下两种写法之一：

    temp = temp.replace(chr(0x2019), "'") # hexadecimal code point: U+2019

或：

    temp = temp.replace(chr(8217), "'") # decimal form of the same code point (0x2019)

这样才能正确替换该字符。

其次，出现该错误的原因是程序的其他某个部分（可能是数据库后端）正在尝试用UTF-8以外的某种编码对Unicode字符串进行编码。由于您没有在问题中附上完整的回溯（traceback），很难说得更准确。不过，错误信息中提到的“charmap”表明所用的是某个Windows代码页（但不是cp1252），或某种ISO编码（但不是iso8859-1，即latin1），也可能是KOI8_R。

无论如何，处理此问题的正确方法是确保程序的所有部分（尤其是数据库）都使用UTF-8。如果你这样做，你将不再需要更换字符了。

Answer 2

您可以对unicode字符串进行编码，将其转换为str：

类型

 a=u"dataàçççñññ"  # sample text: ASCII letters followed by accented letters
type(a)
a.encode('ascii','ignore')  # errors='ignore' drops characters that ASCII cannot encode

用这种方式会去掉特殊字符，返回 'data'。

另一种方法是使用unicodedata模块。

Answer 3

--no-daemon

如何在Python中替换Unicode字符？

3 个答案: