Morphodita库绑定

ufal.morphodita的Python项目详细描述


ufal.morphodita

ufal.morphodita是绑定到morphodita库的python<;http://ufal.mff.cuni.cz/morphodita>;。

绑定是C++bindings api的直接转换。 在python 2中,字符串可以是unicode和utf-8编码的str,并且 库始终生成unicode在python 3中,字符串只能是str

<包装C++ API > 正在包装的C++ API如下。对于原始API的引用 C++ API,参见& lt;http://ufal.mff.cuni.cz/morphodita/api-reference&gt;

Helper Structures
-----------------

  typedef vector<int> Indices;

  typedef vector<string> Forms;

  struct TaggedForm {
    string form;
    string tag;
  };
  typedef vector<TaggedForm> TaggedForms;

  struct TaggedLemma {
    string lemma;
    string tag;
  };
  typedef vector<TaggedLemma> TaggedLemmas;
  typedef vector<TaggedLemmas> Analyses;

  struct TaggedLemmaForms {
    string lemma;
    TaggedForms forms;
  };
  typedef vector<TaggedLemmaForms> TaggedLemmasForms;

  struct TokenRange {
    size_t start;
    size_t length;
  };
  typedef vector<TokenRange> TokenRanges;

  struct DerivatedLemma {
    std::string lemma;
  };
  typedef vector<DerivatedLemma> DerivatedLemmas;


Main Classes
------------

  class Version {
   public:
    unsigned major;
    unsigned minor;
    unsigned patch;
    string prerelease;

    static Version current();
  };

  class Tokenizer {
   public:
    virtual void setText(const char* text);
    virtual bool nextSentence(Forms* forms, TokenRanges* tokens);

    static Tokenizer* newVerticalTokenizer();
    static Tokenizer* newCzechTokenizer();
    static Tokenizer* newEnglishTokenizer();
    static Tokenizer* newGenericTokenizer();
  };

  class Derivator {
   public:
    virtual bool parent(const char* lemma, DerivatedLemma& parent) const;
    virtual bool children(const char* lemma, DerivatedLemmas& children) const;
  };

  class DerivationFormatter {
   public:
    virtual string formatDerivation(const char* lemma) const;

    static DerivationFormatter* newNoneDerivationFormatter();
    static DerivationFormatter* newRootDerivationFormatter(const Derivator* derivator);
    static DerivationFormatter* newPathDerivationFormatter(const Derivator* derivator);
    static DerivationFormatter* newTreeDerivationFormatter(const Derivator* derivator);
    static DerivationFormatter* newDerivationFormatter(const char* name, const Derivator* derivator);
  };

  class Morpho {
   public:
    static Morpho* load(const char* fname);

    enum { NO_GUESSER = 0, GUESSER = 1 };

    virtual int analyze(const char* form, int guesser, TaggedLemmas& lemmas) const;
    virtual int generate(const char* lemma, const char* tag_wildcard, int guesser, TaggedLemmasForms& forms) const;
    virtual string rawLemma(const char* lemma) const;
    virtual string lemmaId(const char* lemma) const;
    virtual string rawForm(const char* form) const;

    virtual Tokenizer* newTokenizer() const;

    virtual Derivator* getDerivator() const;
  };

  class Tagger {
   public:
    static Tagger* load(const char* fname);

    virtual const Morpho* getMorpho() const;

    virtual void tag(const Forms& forms, TaggedLemmas& tags, int guesser = -1) const;

    virtual void tagAnalyzed(const Forms& forms, const Analyses& analyses, Indices& tags) const;

    Tokenizer* newTokenizer() const;
  };

  class TagsetConverter {
   public:
    static TagsetConverter* newIdentityConverter();
    static TagsetConverter* newPdtToConll2009Converter();
    static TagsetConverter* newStripLemmaCommentConverter(const Morpho& morpho);
    static TagsetConverter* newStripLemmaIdConverter(const Morpho& morpho);

    virtual void convert(TaggedLemma& lemma) const;
    virtual void convertAnalyzed(TaggedLemmas& lemmas) const;
    virtual void convertGenerated(TaggedLemmasForms& forms) const;
  };

示例

运行morpho cli

执行形态分析和生成的简单示例:

import sys

from ufal.morphodita import *

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
  import codecs
  import locale
  encoding = locale.getpreferredencoding()
  sys.stdin = codecs.getreader(encoding)(sys.stdin)
  sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) < 2:
  sys.stderr.write('Usage: %s dict_file\n' % sys.argv[0])
  sys.exit(1)

sys.stderr.write('Loading dictionary: ')
morpho = Morpho.load(sys.argv[1])
if not morpho:
  sys.stderr.write("Cannot load dictionary from file '%s'\n" % sys.argv[1])
  sys.exit(1)
sys.stderr.write('done\n')

lemmas = TaggedLemmas()
lemmas_forms = TaggedLemmasForms()
line = sys.stdin.readline()
while line:
  tokens = line.rstrip('\r\n').split('\t')
  if len(tokens) == 1: # analyze
    result = morpho.analyze(tokens[0], morpho.GUESSER, lemmas)

    guesser = "Guesser " if result == morpho.GUESSER else ""
    for lemma in lemmas:
      sys.stdout.write('%sLemma: %s %s\n' % (guesser, lemma.lemma, lemma.tag))
  elif len(tokens) == 2: # generate
    result = morpho.generate(tokens[0], tokens[1], morpho.GUESSER, lemmas_forms)

    guesser = "Guesser " if result == morpho.GUESSER else ""
    for lemma_forms in lemmas_forms:
      sys.stdout.write('%sLemma: %s\n' % (guesser, lemma_forms.lemma))
      for form in lemma_forms.forms:
        sys.stdout.write('  %s %s\n' % (form.form, form.tag))

  line = sys.stdin.readline()

运行标记器

执行标记化和PoS标记的简单示例:

import sys

from ufal.morphodita import *

def encode_entities(text):
  return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
  import codecs
  import locale
  encoding = locale.getpreferredencoding()
  sys.stdin = codecs.getreader(encoding)(sys.stdin)
  sys.stdout = codecs.getwriter(encoding)(sys.stdout)

if len(sys.argv) == 1:
  sys.stderr.write('Usage: %s tagger_file\n' % sys.argv[0])
  sys.exit(1)

sys.stderr.write('Loading tagger: ')
tagger = Tagger.load(sys.argv[1])
if not tagger:
  sys.stderr.write("Cannot load tagger from file '%s'\n" % sys.argv[1])
  sys.exit(1)
sys.stderr.write('done\n')

forms = Forms()
lemmas = TaggedLemmas()
tokens = TokenRanges()
tokenizer = tagger.newTokenizer()
if tokenizer is None:
  sys.stderr.write("No tokenizer is defined for the supplied model!")
  sys.exit(1)

not_eof = True
while not_eof:
  text = ''

  # Read block
  while True:
    line = sys.stdin.readline()
    not_eof = bool(line)
    if not not_eof: break
    line = line.rstrip('\r\n')
    text += line
    text += '\n';
    if not line: break



  # Tag
  tokenizer.setText(text)
  t = 0
  while tokenizer.nextSentence(forms, tokens):
    tagger.tag(forms, lemmas)

    for i in range(len(lemmas)):
      lemma = lemmas[i]
      token = tokens[i]
      sys.stdout.write('%s%s<token lemma="%s" tag="%s">%s</token>%s' % (
        encode_entities(text[t : token.start]),
        "<sentence>" if i == 0 else "",
        encode_entities(lemma.lemma),
        encode_entities(lemma.tag),
        encode_entities(text[token.start : token.start + token.length]),
        "</sentence>" if i + 1 == len(lemmas) else "",
      ))
      t = token.start + token.length
  sys.stdout.write(encode_entities(text[t : ]))

作者

米兰斯特拉卡<;straka@ufal.mff.cuni.cz>;

雅娜·斯特拉科娃<;strakova@ufal.mff.cuni.cz>

欢迎加入QQ群-->: 979659372 Python中文网_新手群

推荐PyPI第三方库


热门话题
java类。getResource和ClassLoader。getSystemResource:有没有理由选择其中一个而不是另一个?   在Java中以编程方式粘贴后恢复剪贴板   Java字符串到日期没有时间   JavaSpring注释:@Component起作用,@Repository不起作用   java“addScript”在HSQL中是否有最大记录计数?   java如何将值从JDialog框返回到父JFrame?   java我的模块库的用户有没有办法访问尚未导出的类?   java javac:未找到命令   java如何解决jsoup错误:无法找到请求目标的有效证书路径   类中的java作用域变量   Java中集合实现中的arraylist add()方法不起作用   java如何使用while循环和从用户接收输入来近似Pi?   java Spring安全CSRF培训模式   在安卓系统中,如何通过在警报框外单击来限制用户?