Python 正则表达式报错:bad character range(错误的字符范围)

2022-05-21 08:05:52 发布

您现在位置:Python中文网/ 问答频道 /正文

我运行了下面这段代码:

# Tweet text clean-up: strip retweet markers, mentions, links and punctuation,
# then lower-case the result.
# `pd`, `re` and `tweet_list` are not defined in this snippet; presumably they
# are pandas, the stdlib regex module and a list of tweet strings -- confirm.

# Build a DataFrame from the collected tweets and copy column 0 into a new
# "text" column that the cleaning steps below operate on.
tw_list = pd.DataFrame(tweet_list)
tw_list["text"] = tw_list[0]

# remove_rt: replace a leading-style retweet marker ("RT @user: ") with a space.
remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
# rt: replace @mentions, a second character class, and URL-like tokens with a
# space.
# NOTE(review): as written, the second character class starts with the range
# "°-9". The code point of "°" is higher than that of "9", so `re` rejects the
# pattern with "bad character range" when it is compiled, i.e. on the first
# call of this lambda. Presumably the intended class was the negated set
# [^0-9A-Za-z \t] -- confirm against the original source of this pattern.
# NOTE(review): the first class contains "0–9" written with an en dash (U+2013)
# rather than an ASCII hyphen, so it is three literal characters, not a digit
# range -- looks like a copy/paste artifact; verify.
rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([°-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
# Apply both substitutions to every value in the "text" column, in order.
tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
# Normalise the cleaned text to lower case.
tw_list["text"] = tw_list.text.str.lower()
# Preview the first 10 rows.
tw_list.head(10)

运行后出现了错误,下面是完整的回溯信息(Traceback (most recent call last)):

<ipython-input-15-e640b99d08dd> in <module>
      8 remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
      9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
---> 10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
     11 tw_list["text"] = tw_list.text.str.lower()
     12 tw_list.head(10)

c:\program files\python39\lib\site-packages\pandas\core\series.py in map(self, arg, na_action)
   3907         dtype: object
   3908         """
-> 3909         new_values = super()._map_values(arg, na_action=na_action)
   3910         return self._constructor(new_values, index=self.index).__finalize__(
   3911             self, method="map"

c:\program files\python39\lib\site-packages\pandas\core\base.py in _map_values(self, mapper, na_action)
    935 
    936         # mapper is a function
--> 937         new_values = map_f(values, mapper)
    938 
    939         return new_values

pandas\_libs\lib.pyx in pandas._libs.lib.map_infer()

<ipython-input-15-e640b99d08dd> in <lambda>(x)
      7 #Removing RT, Punctuation etc
      8 remove_rt = lambda x: re.sub('RT @\w+: '," ",x)
----> 9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z \t])|(\w+:\/\/\S+)"," ",x)
     10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
     11 tw_list["text"] = tw_list.text.str.lower()

c:\program files\python39\lib\re.py in sub(pattern, repl, string, count, flags)
    208     a callable, it's passed the Match object and must return
    209     a replacement string to be used."""
--> 210     return _compile(pattern, flags).sub(repl, string, count)
    211 
    212 def subn(pattern, repl, string, count=0, flags=0):

c:\program files\python39\lib\re.py in _compile(pattern, flags)
    302     if not sre_compile.isstring(pattern):
    303         raise TypeError("first argument must be string or compiled pattern")
--> 304     p = sre_compile.compile(pattern, flags)
    305     if not (flags & DEBUG):
    306         if len(_cache) >= _MAXCACHE:

c:\program files\python39\lib\sre_compile.py in compile(p, flags)
    762     if isstring(p):
    763         pattern = p
--> 764         p = sre_parse.parse(p, flags)
    765     else:
    766         pattern = None

c:\program files\python39\lib\sre_parse.py in parse(str, flags, state)
    946 
    947     try:
--> 948         p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
    949     except Verbose:
    950         # the VERBOSE flag was switched on inside the pattern.  to be

c:\program files\python39\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
    441     start = source.tell()
    442     while True:
--> 443         itemsappend(_parse(source, state, verbose, nested + 1,
    444                            not nested and not items))
    445         if not sourcematch("|"):

c:\program files\python39\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
    832             sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
    833                            not (del_flags & SRE_FLAG_VERBOSE))
--> 834             p = _parse_sub(source, state, sub_verbose, nested + 1)
    835             if not source.match(")"):
    836                 raise source.error("missing ), unterminated subpattern",

c:\program files\python39\lib\sre_parse.py in _parse_sub(source, state, verbose, nested)
    441     start = source.tell()
    442     while True:
--> 443         itemsappend(_parse(source, state, verbose, nested + 1,
    444                            not nested and not items))
    445         if not sourcematch("|"):

c:\program files\python39\lib\sre_parse.py in _parse(source, state, verbose, nested, first)
    596                     if hi < lo:
    597                         msg = "bad character range %s-%s" % (this, that)
--> 598                         raise source.error(msg, len(this) + 1 + len(that))
    599                     setappend((RANGE, (lo, hi)))
    600                 else:

错误信息为:error: bad character range ⁰-9 at position 18(错误的字符范围 ⁰-9,位于位置 18)

我想创建一个新的 DataFrame(tw_list)和一个新的特征列(text),然后用 lambda 函数清理文本(去掉 RT 标记、链接和标点符号),最后把文本转换为小写。


Tags: textinpymapsourceparselibfilesprogramlistflagspatterntwrtpython39
1条回答
网友
1楼 ·

您的正则表达式的问题出在这一段:“…[°-9]”。在字符类中,“-”会被解析为范围运算符,而“°”的码位大于“9”,因此“°-9”不是合法的范围,编译时就会报 bad character range。如果确实需要匹配字符“°”和“-”本身,则需要用反斜杠转义“-”(写成“\-”),以避免正则引擎把它当作范围来解析。

如果您本来想写的是“0”而不是“°”,则只需把“°”替换为“0”即可(即写成“[0-9]”)。

您可以在此处看到您的问题: https://regex101.com/r/hhf27i/1

以下是修复方法: https://regex101.com/r/8d1VxP/1