在pandas datafram中插入新行

In [8]: df.head() Out[8]: text lemma pos markintext doublemma multiwordexpr nodetail 0 Per per epsf 0 0 0 0 1 correr correre vta2fp 0 0 0 0 2 miglior migliore a2fp 0 0 0 0 3 acque acqua sf1fp 0 0 0 0 4 alza alzare vta1ips3 0 0 0 0

text lemma pos markintext doublemma multiwordexpr 16 dietro dietro a eilksl 0 0 1 17 a dietro a eilksl 0 0 1

#!/usr/bin/python # -*- coding: latin-1 -*- from lxml import etree import locale import sys import os import glob import pandas as pd import numpy as np import re from string import punctuation import random import unicodedata def manage_tail(taillist): z = [] for line in taillist: y = list(line.strip()) for punkt in y: z.append(punkt) return z if len(z) > 0 else 0 def checkmark(text): pattern = re.compile("\w|'",re.UNICODE) if re.match(pattern,text[-1]): return 0 else: return text[-1] path = "~/working_corpus/" output_path = "~/devel_output/" f = "*.xml" docs = [f for f in glob.glob(os.path.join(path,f))] parser = etree.XMLParser(load_dtd= True,resolve_entities=True) x = [] for d in docs: tree = etree.parse(d,parser) for node in [z for z in tree.iterfind(".//LM")]: text = node.text.strip() multiwordexpr = 1 if (' ' in text.replace(' ', ' ')) else 0 lemma = node.get('lemma') markintext = checkmark(text) pos = node.get('catg') doublemma = 1 if (node.getparent() is not None and node.getparent().tag == 'LM1') else 0 nodetail = manage_tail(node.tail.splitlines()) if node.tail else None row = [text,lemma,pos,markintext,doublemma,multiwordexpr,nodetail] x.append(row) df = pd.DataFrame(x,columns=('text','lemma','pos','markintext','doublemma','multiwordexpr','nodetail'))

l = [] i = 0 while i < len(df): if (df.iloc[i,6] != 0): ntail = df.iloc[i,6] df.iloc[i,6] = 0 i += 1 for w in range(len(ntail)): line = pd.DataFrame({'text': ntail[w], 'lemma': ntail[w], 'pos':'NaN', 'markintext':0, 'doublemma':0, 'multiwordexpr':0, 'nodetail':0},index=[i+w], columns=('text','lemma','pos','markintext','doublemma','multiwordexpr','nodetail')) l.append(line) else: pass i += 1 sys.stdout.write("\r%d/%d" % (i,len(df))) sys.stdout.flush() print "...done extracting." for i in range(len(l)): start = int((l[i].index[0])-1) end = int(l[i].index[0]) df = pd.concat([df.ix[:start], l[i], df.ix[end:]]).reset_index(drop=True) sys.stdout.write("\r%d/%d" % (i,len(l))) sys.stdout.flush()

1条回答

网友

1楼 · 发布于 2024-05-14 15:52:32

编辑：您可以预先分配df，所需的长度将是len(df)+df.multiwordexpr.sum()，然后您可以使用.ix[]来设置正确的行。你仍然需要迭代原始df并将其拆分。那可能会更快。在

row = ['','','',0,0,0,0]
#calculate correct length depending on your original df
df_len = len(orig_df)+orig_df.multiwordexpr.sum()

#allocate a new df
result_df = pd.DataFrame([row for x in xrange(df_len)],
                      columns=columns)
#write to it instead appending
result_df.ix[index] = ['Per','per','epsf',0,0,0,0]

编辑结束

也许创建一个新的数据帧并只附加到它会比修改原始数据帧快吗？在

在拆分多字表达式行时，可以迭代原始df并附加到新的df。但不知道这是否会更好。在

^{pr2}$

相关问题更多 >

编程相关推荐

热门问题

热门文章