如何在Python中不使用外部库解析arff文件
我需要解析一个像下面这样的arff文件,但不想使用任何外部库。我不太确定怎么把属性和数值联系起来。比如,怎么才能说每行的第一个数值是年龄,第二个数值是性别呢?你能给我一些关于解析类似情况的Python代码链接吗?
@relation cleveland-14-heart-disease
@attribute 'age' real
@attribute 'sex' { female, male}
@attribute 'cp' { typ_angina, asympt, non_anginal, atyp_angina}
@attribute 'trestbps' real
@attribute 'chol' real
@attribute 'fbs' { t, f}
@attribute 'restecg' { left_vent_hyper, normal, st_t_wave_abnormality}
@attribute 'thalach' real
@attribute 'exang' { no, yes}
@attribute 'oldpeak' real
@attribute 'slope' { up, flat, down}
@attribute 'ca' real
@attribute 'thal' { fixed_defect, normal, reversable_defect}
@attribute 'class' { negative, positive}
@data
63,male,typ_angina,145,233,t,left_vent_hyper,150,no,2.3,down,0,fixed_defect,negative
37,male,non_anginal,130,250,f,normal,187,no,3.5,down,0,normal,negative
41,female,atyp_angina,130,204,f,left_vent_hyper,172,no,1.4,up,0,normal,negative
56,male,atyp_angina,120,236,f,normal,178,no,0.8,up,0,normal,negative
57,female,asympt,120,354,f,normal,163,yes,0.6,up,0,normal,negative
57,male,asympt,140,192,f,normal,148,no,0.4,flat,0,fixed_defect,negative
56,female,atyp_angina,140,294,f,left_vent_hyper,153,no,1.3,flat,0,normal,negative
44,male,atyp_angina,120,263,f,normal,173,no,0,up,0,reversable_defect,negative
52,male,non_anginal,172,199,t,normal,162,no,0.5,up,0,reversable_defect,negative
这是我写的一个示例代码:
# Collect the rows of the ARFF file as lists of raw string fields.
arr = []
# The with-block closes the file even if reading fails part-way through.
with open("heart_train.arff") as arff_file:
    for line in arff_file:
        # "@..." lines are header directives, "%..." lines are comments
        if line.startswith(("@", "%")):
            continue
        arr.append(line.strip("\n").split(','))
print(arr[1:30])
不过输出的结果和我预期的差别很大:
[['37', 'male', 'non_anginal', '130', '250', 'f', 'normal', '187', 'no', '3.5', 'down', '0', 'normal', 'negative'], ['41', 'female', 'atyp_angina', '130', '204', 'f', 'left_vent_hyper', '172', 'no', '1.4', 'up', '0', 'normal', 'negative'], ['56', 'male', 'atyp_angina', '120', '236', 'f', 'normal', '178', 'no', '0.8', 'up', '0', 'normal', 'negative'], ['57', 'female', 'asympt', '120', '354', 'f', 'normal', '163', 'yes', '0.6', 'up', '0', 'normal', 'negative'], ['57', 'male', 'asympt', '140', '192', 'f', 'normal', '148', 'no', '0.4', 'flat', '0', 'fixed_defect', 'negative'], ['56', 'female', 'atyp_angina', '140', '294', 'f', 'left_vent_hyper', '153', 'no', '1.3', 'flat', '0', 'normal', 'negative'], ['44', 'male', 'atyp_angina', '120', '263', 'f', 'normal', '173', 'no', '0', 'up', '0', 'reversable_defect', 'negative'], ['52', 'male', 'non_anginal', '172', '199', 't', 'normal', '162', 'no', '0.5', 'up', '0', 'reversable_defect', 'negative'], ['57', 'male', 'non_anginal', '150', '168', 'f', 'normal', '174', 'no', '1.6', 'up', '0', 'normal', 'negative'], ['54', 'male', 'asympt', '140', '239', 'f', 'normal', '160', 'no', '1.2', 'up', '0', 'normal', 'negative'], ['48', 'female', 'non_anginal', '130', '275', 'f', 'normal', '139', 'no', '0.2', 'up', '0', 'normal', 'negative'], ['49', 'male', 'atyp_angina', '130', '266', 'f', 'normal', '171', 'no', '0.6', 'up', '0', 'normal', 'negative'], ['64', 'male', 'typ_angina', '110', '211', 'f', 'left_vent_hyper', '144', 'yes', '1.8', 'flat', '0', 'normal', 'negative'], ['58', 'female', 'typ_angina', '150', '283', 't', 'left_vent_hyper', '162', 'no', '1', 'up', '0', 'normal', 'negative'], ['50', 'female', 'non_anginal', '120', '219', 'f', 'normal', '158', 'no', '1.6', 'flat', '0', 'normal', 'negative'], ['58', 'female', 'non_anginal', '120', '340', 'f', 'normal', '172', 'no', '0', 'up', '0', 'normal', 'negative'], ['66', 'female', 'typ_angina', '150', '226', 'f', 'normal', '114', 'no', '2.6', 'down', '0', 'normal', 'negative'], ['43', 
'male', 'asympt', '150', '247', 'f', 'normal', '171', 'no', '1.5', 'up', '0', 'normal', 'negative'], ['69', 'female', 'typ_angina', '140', '239', 'f', 'normal', '151', 'no', '1.8', 'up', '2', 'normal', 'negative'], ['59', 'male', 'asympt', '135', '234', 'f', 'normal', '161', 'no', '0.5', 'flat', '0', 'reversable_defect', 'negative'], ['44', 'male', 'non_anginal', '130', '233', 'f', 'normal', '179', 'yes', '0.4', 'up', '0', 'normal', 'negative'], ['42', 'male', 'asympt', '140', '226', 'f', 'normal', '178', 'no', '0', 'up', '0', 'normal', 'negative'], ['61', 'male', 'non_anginal', '150', '243', 't', 'normal', '137', 'yes', '1', 'flat', '0', 'normal', 'negative'], ['40', 'male', 'typ_angina', '140', '199', 'f', 'normal', '178', 'yes', '1.4', 'up', '0', 'reversable_defect', 'negative'], ['71', 'female', 'atyp_angina', '160', '302', 'f', 'normal', '162', 'no', '0.4', 'up', '2', 'normal', 'negative'], ['59', 'male', 'non_anginal', '150', '212', 't', 'normal', '157', 'no', '1.6', 'up', '0', 'normal', 'negative'], ['51', 'male', 'non_anginal', '110', '175', 'f', 'normal', '123', 'no', '0.6', 'up', '0', 'normal', 'negative'], ['65', 'female', 'non_anginal', '140', '417', 't', 'left_vent_hyper', '157', 'no', '0.8', 'up', '1', 'normal', 'negative'], ['53', 'male', 'non_anginal', '130', '197', 't', 'left_vent_hyper', '152', 'no', '1.2', 'down', '0', 'normal', 'negative']]
你知道怎么才能得到像下面这个由arff库(来自Weka)生成的输出吗?

1 个回答
5
你说“没有外部库”,那你至少能把代码复制粘贴到自己的代码里吗?你可能会觉得这个arff模块的源代码很有用(大约200行,大小约5.6 KB)。
编辑:
你可能会觉得这个格式参考很有帮助:http://weka.wikispaces.com/ARFF+%28stable+version%29
编辑2:
为了好玩,我写了自己的.arff解析器;它的长度几乎和WEKA的代码一样,但应该更容易读懂——只有六个函数,一个调度表,还有一个非常模块化的类。你可以遍历这个类的实例来逐行获取数据,每一行都是一个命名元组。
看看你觉得怎么样:
from collections import namedtuple
from keyword import iskeyword
import re
def NotDone(msg):
    """
    Signal that a feature is not implemented.

    Always raises NotImplementedError carrying msg.  (NotImplemented is a
    non-callable constant meant for rich comparisons, so it cannot be used
    as the exception here.)
    """
    raise NotImplementedError(msg)
def nominal(spec):
    """
    Build a converter for an ARFF nominal (enumerated) attribute.

    spec is the brace-delimited value list, e.g. "{ female, male}".  The
    returned function strips a raw field and returns it if it is one of the
    listed values; otherwise it raises ValueError.
    """
    inner = spec.lstrip("{ \t").rstrip("} \t")
    allowed = {item.strip() for item in inner.split(",")}

    def fn(s):
        candidate = s.strip()
        if candidate not in allowed:
            raise ValueError("'{}' is not a recognized value".format(candidate))
        return candidate

    # make the converter introspect nicely: fixed name, docstring listing values
    fn.__name__ = "nominal"
    fn.__doc__ = (
        "\nARFF nominal (enumerated) data type\n"
        "Legal values are {}\n"
    ).format(sorted(allowed))
    return fn
def numeric(s):
    """
    Convert a string to a number.

    Returns an int when the text parses as one, otherwise a float.  Raises
    ValueError (from float) when the text is not numeric at all.
    """
    try:
        value = int(s)
    except ValueError:
        # not an integer literal; fall back to floating point
        value = float(s)
    return value
# Dispatch table: ARFF attribute type name -> factory.  A factory receives the
# attribute's spec string and returns the callable that converts one raw field
# of a data row.  The "date" and "relational" factories call NotDone instead
# of returning a converter.
field_maker = {
    "date": lambda spec: NotDone("date data type not implemented"),
    "integer": lambda spec: int,
    "nominal": nominal,
    "numeric": lambda spec: numeric,
    "string": lambda spec: str,
    "real": lambda spec: float,
    "relational": lambda spec: NotDone("relational data type not implemented"),
}
def file_lines(fname):
    """
    Lazily yield the lines of the file named fname.

    Each line is yielded with trailing whitespace (including the newline)
    removed.  The file is opened on first use and closed by the with-block
    once the generator is exhausted or closed.
    """
    with open(fname) as source:
        for raw in source:
            stripped = raw.rstrip()
            yield stripped
def no_data_yet(*items):
    """
    Placeholder row constructor used before the header has been read.

    Accepts any positional arguments and always raises ValueError, because
    the real row type cannot be built until a @data directive is seen.
    """
    raise ValueError("ArffRow not fully defined (haven't seen a @data directive yet)")
def make_field_name(s):
    """
    Mangle string to make it a valid Python identifier

    The text is lowercased, every run of characters other than a-z / 0-9 is
    dropped, and the remaining runs are joined with "_".  If the result is
    empty, is a Python keyword, or starts with a digit, it is prefixed with
    "f_" so that it can be used as a namedtuple field name.
    """
    s = s.lower()                               # force to lowercase
    s = "_".join(re.findall("[a-z0-9]+", s))    # keep only alphanumeric runs
    # An empty result (name had no usable characters) is not an identifier
    # either, so it gets the same safe prefix as keywords and leading digits.
    if not s or iskeyword(s) or s[0].isdigit():
        s = "f_" + s
    return s
class ArffReader:
    """
    Read an ARFF file and iterate over its data rows.

    Constructing a reader consumes the file header (blank lines, % comments,
    @relation and @attribute directives) up to and including the @data
    directive.  Iterating afterwards returns one ArffRow named tuple per data
    line, each field converted by the converter of its attribute.

    Attributes:
        fname:      path the reader was created with
        relation:   text following @relation, or '(not specified)'
        data_names: field names, in attribute order
        data_types: converter callables, in attribute order
        dtype:      the ArffRow named-tuple class built from data_names
    """

    # Header line kinds, in the order they are tested.  Each kind needs a
    # line_is_<kind> predicate and a line_do_<kind> handler.
    line_types = ["blank", "comment", "relation", "attribute", "data"]

    # ARFF marks a missing value with a single question mark.
    _missing = "?"

    # Raw strings keep the regex escapes (\s, \w, \t) out of Python's own
    # string-escape processing; compiled once for all attribute lines.
    _attribute_re = re.compile(
        r"^@attribute"           # line starts with '@attribute'
        r"\s+"
        r"("                     # name is one of:
        r"(?:'[^']+')"           #   'string in single quotes'
        r'|(?:"[^"]+")'          #   "string in double quotes"
        r"""|(?:[^ \t'"]+)"""    #   single_word_string (no spaces)
        r")"
        r"\s+"
        r"("                     # type is one of:
        r"(?:\{[^}]+\})"         #   { set, of, nominal, values }
        r"|(?:\w+)"              #   datatype
        r")"
        r"\s*"
        r"("                     # spec string:
        r".*"                    #   anything to end of line
        r")$",
        flags=re.I)              # case-insensitive

    def __init__(self, fname):
        """
        Open fname and parse its header.

        Raises ValueError for a malformed @attribute line and KeyError for
        an attribute type that is not in field_maker.
        """
        # get input file
        self.fname = fname
        self.lines = file_lines(fname)
        # prepare to read file header
        self.relation = '(not specified)'
        self.data_names = []
        self.data_types = []
        self.dtype = no_data_yet
        # pair every line-type predicate with its handler
        line_tests = [
            (getattr(self, "line_is_{}".format(item)),
             getattr(self, "line_do_{}".format(item)))
            for item in self.__class__.line_types
        ]
        # read file header
        for line in self.lines:
            # Reset per line so a line that matches no predicate is simply
            # ignored instead of reading an unbound or stale flag.
            done = False
            for is_, do in line_tests:
                if is_(line):
                    done = do(line)
                    break
            if done:
                break

        # use header fields to build data type (and make it print as requested)
        class ArffRow(namedtuple('ArffRow', self.data_names)):
            __slots__ = ()

            def __str__(self):
                items = (getattr(self, field) for field in self._fields)
                return "({})".format(", ".join(repr(it) for it in items))

        self.dtype = ArffRow

    #
    # figure out input-line type
    #
    def line_is_blank(self, line):
        """Return True for an empty line."""
        return not line

    def line_is_comment(self, line):
        """Return True for a line starting with '%'."""
        return line.startswith('%')

    def line_is_relation(self, line):
        """Return True for an @relation directive (any letter case)."""
        return line.lower().startswith('@relation')

    def line_is_attribute(self, line):
        """Return True for an @attribute directive (any letter case)."""
        return line.lower().startswith('@attribute')

    def line_is_data(self, line):
        """Return True for the @data directive (any letter case)."""
        return line.lower().startswith('@data')

    #
    # handle input-line type
    #
    def line_do_blank(self, line):
        """Ignore a blank line."""
        pass

    def line_do_comment(self, line):
        """Ignore a comment line."""
        pass

    def line_do_relation(self, line):
        """Store the relation name."""
        # skip the 9 characters of "@relation" plus the separator after it
        self.relation = line[10:].strip()

    def line_do_attribute(self, line):
        """
        Parse an @attribute line and record its field name and converter.

        Raises ValueError if the line does not match the expected layout.
        """
        m = self._attribute_re.match(line)
        if not m:
            raise ValueError("failed parsing attribute line '{}'".format(line))
        name, type_, spec = m.groups()
        if type_[0] == '{':
            # a brace-delimited value list is itself the nominal spec
            type_, spec = 'nominal', type_
        # The regex accepts any letter case, so the table lookup must too.
        converter = field_maker[type_.lower()](spec)
        # Record name and converter together, only once both are known, so
        # the two lists cannot get out of step if the lookup fails.
        self.data_names.append(make_field_name(name))
        self.data_types.append(converter)

    def line_do_data(self, line):
        """Report that the header is finished."""
        return True     # flag end of header

    #
    # make the class iterable
    #
    def __iter__(self):
        """Return the reader itself; it is its own iterator."""
        return self

    def next(self):
        """
        Return one data row at a time

        Blank lines and % comment lines are skipped.  A field consisting of
        a single '?' (the ARFF missing-value marker) becomes None; every
        other field is passed through its attribute's converter.  Raises
        StopIteration when the input is exhausted.
        """
        while True:
            line = next(self.lines)     # StopIteration here ends iteration
            if not (self.line_is_blank(line) or self.line_is_comment(line)):
                break
        data = line.split(',')
        return self.dtype(*(
            None if dat.strip() == self._missing else fn(dat)
            for fn, dat in zip(self.data_types, data)
        ))

    # Python 3 looks up __next__; keep next() for Python 2 callers.
    __next__ = next
它可以这样使用
# Parse the header once, then print every data row.
reader = ArffReader('mydata.arff')
for row in reader:
    print(row)
结果是
(63.0, 'male', 'typ_angina', 145.0, 233.0, 't', 'left_vent_hyper', 150.0, 'no', 2.3, 'down', 0.0, 'fixed_defect', 'negative')
(37.0, 'male', 'non_anginal', 130.0, 250.0, 'f', 'normal', 187.0, 'no', 3.5, 'down', 0.0, 'normal', 'negative')
(41.0, 'female', 'atyp_angina', 130.0, 204.0, 'f', 'left_vent_hyper', 172.0, 'no', 1.4, 'up', 0.0, 'normal', 'negative')
(56.0, 'male', 'atyp_angina', 120.0, 236.0, 'f', 'normal', 178.0, 'no', 0.8, 'up', 0.0, 'normal', 'negative')
(57.0, 'female', 'asympt', 120.0, 354.0, 'f', 'normal', 163.0, 'yes', 0.6, 'up', 0.0, 'normal', 'negative')
(57.0, 'male', 'asympt', 140.0, 192.0, 'f', 'normal', 148.0, 'no', 0.4, 'flat', 0.0, 'fixed_defect', 'negative')
(56.0, 'female', 'atyp_angina', 140.0, 294.0, 'f', 'left_vent_hyper', 153.0, 'no', 1.3, 'flat', 0.0, 'normal', 'negative')
(44.0, 'male', 'atyp_angina', 120.0, 263.0, 'f', 'normal', 173.0, 'no', 0.0, 'up', 0.0, 'reversable_defect', 'negative')
(52.0, 'male', 'non_anginal', 172.0, 199.0, 't', 'normal', 162.0, 'no', 0.5, 'up', 0.0, 'reversable_defect', 'negative')
字段也可以通过名称访问,也就是说
# Row fields are reachable by attribute name as well as by position.
template = "{} year old {}"
for patient in ArffReader('mydata.arff'):
    print(template.format(patient.age, patient.sex))
这会得到
63.0 year old male
37.0 year old male
41.0 year old female
56.0 year old male
57.0 year old female
57.0 year old male
56.0 year old female
44.0 year old male
52.0 year old male
你可以通过以下方式查看字段名
>>> print(repr(patient))
ArffRow(age=63.0, sex='male', cp='typ_angina', trestbps=145.0, chol=233.0, fbs='t', restecg='left_vent_hyper', thalach=150.0, exang='no', oldpeak=2.3, slope='down', ca=0.0, thal='fixed_defect', f_class='negative')
字段名称遵循ARFF头部的规则,强制为小写(在'class'的情况下,前面加上'f_',因为class
是Python的关键字,所以不能用作字段名)。