绑定到UDPipe库
ufal.udpipe的Python项目详细描述
ufal.udpipe
ufal.udpipe 是 UDPipe 库 <http://ufal.mff.cuni.cz/udpipe> 的 Python 绑定。
该绑定是 C++ 绑定 API 的直接转换。在 Python 2 中,字符串既可以是 unicode,也可以是 UTF-8 编码的 str,而库始终生成 unicode。在 Python 3 中,字符串只能是 str。
封装的 C++ API
下面列出被封装的 C++ API。关于原始 C++ API 的参考文档,参见 <http://ufal.mff.cuni.cz/udpipe/api-reference>。
Helper Structures
-----------------
// Integer list; used below as the type of Word::children.
using Children = vector<int>;
// String list; used below as the type of Sentence::comments.
using Comments = vector<string>;
// Error holder handed to the processing calls in this listing through their
// `ProcessingError*` parameter. The Python example in this document creates one,
// passes it to Pipeline::process, then checks occurred() and prints message.
class ProcessingError {
public:
// Checked by the example right after processing; presumably true when the
// preceding call failed -- confirm against the library.
bool occurred();
// Text the example writes to stderr when occurred() is true.
string message;
};
// Base class of Word and MultiwordToken: holds the `form` and `misc` strings
// plus accessors for the whitespace and token-range features listed below.
class Token {
public:
string form;
string misc;
// Both arguments default to an empty string.
Token(const string& form = string(), const string& misc = string());
// CoNLL-U defined SpaceAfter=No feature
bool getSpaceAfter() const;
void setSpaceAfter(bool space_after);
// UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features
// (the SpacesInToken accessors are listed in the same group).
string getSpacesBefore() const;
void setSpacesBefore(const string& spaces_before);
string getSpacesAfter() const;
void setSpacesAfter(const string& spaces_after);
string getSpacesInToken() const;
void setSpacesInToken(const string& spaces_in_token);
// UDPipe-specific TokenRange feature
// NOTE(review): getTokenRange() returns bool while the Start/End getters return
// size_t -- presumably the bool reports whether a range is set; confirm.
bool getTokenRange() const;
size_t getTokenRangeStart() const;
size_t getTokenRangeEnd() const;
void setTokenRange(size_t start, size_t end);
};
// A Token extended with the per-word annotation fields declared below.
class Word : public Token {
public:
// form and misc are inherited from token
int id; // 0 is root, >0 is sentence word, <0 is undefined
string lemma; // lemma
string upostag; // universal part-of-speech tag
string xpostag; // language-specific part-of-speech tag
string feats; // list of morphological features
int head; // head, 0 is root, <0 is undefined
string deprel; // dependency relation to the head
string deps; // secondary dependencies
// Integer list (see Children); presumably the ids of the words whose head is
// this word -- confirm against the library.
Children children;
// id defaults to -1, i.e. "undefined" per the comment on `id`; form defaults to empty.
Word(int id = -1, const string& form = string());
};
// Sequence of Word; the element type of Sentence::words.
typedef vector<Word> Words;
// A Token that carries two word ids instead of a single one.
class MultiwordToken : public Token {
public:
// form and misc are inherited from token
// Presumably the ids of the first and last word covered by this token -- confirm.
int idFirst, idLast;
// Both ids default to -1; form and misc default to empty strings.
MultiwordToken(int id_first = -1, int id_last = -1, const string& form = string(), const string& misc = string());
};
// Sequence of MultiwordToken; the element type of Sentence::multiwordTokens.
typedef vector<MultiwordToken> MultiwordTokens;
// Stand-alone node record (it does not derive from Token) with its own
// form/lemma/tag/feature/deps/misc strings, addressed by an (id, index) pair.
class EmptyNode {
public:
int id; // 0 is root, >0 is sentence word, <0 is undefined
int index; // index for the current id, should be numbered from 1, 0=undefined
string form; // form
string lemma; // lemma
string upostag; // universal part-of-speech tag
string xpostag; // language-specific part-of-speech tag
string feats; // list of morphological features
string deps; // secondary dependencies
string misc; // miscellaneous information
// Defaults give an undefined node: id = -1 and index = 0 (see the field comments).
EmptyNode(int id = -1, int index = 0) : id(id), index(index) {}
};
// Sequence of EmptyNode; the element type of Sentence::emptyNodes.
// The element type must be the EmptyNode class declared above; `empty_node`
// is not declared anywhere in this listing.
typedef vector<EmptyNode> EmptyNodes;
// One sentence: its words, multiword tokens, empty nodes and comments,
// together with editing helpers and accessors for the CoNLL-U comments.
class Sentence {
public:
Sentence();
Words words;
MultiwordTokens multiwordTokens;
EmptyNodes emptyNodes;
Comments comments;
// Presumably the form given to the root word (Word::id 0 is documented as the
// root) -- confirm against the library.
static const string rootForm;
// Basic sentence modifications
bool empty();
void clear();
// Returns a reference to a Word of this sentence; presumably the newly appended
// one -- confirm.
virtual Word& addWord(const char* form);
// Arguments mirror Word::id, Word::head and Word::deprel.
void setHead(int id, int head, const string& deprel);
void unlinkAllWords();
// CoNLL-U defined comments
bool getNewDoc() const;
string getNewDocId() const;
void setNewDoc(bool new_doc, const string& id = string());
bool getNewPar() const;
string getNewParId() const;
void setNewPar(bool new_par, const string& id = string());
string getSentId() const;
void setSentId(const string& id);
string getText() const;
// NOTE(review): the parameter is named `id` although this is the setter paired
// with getText(); it presumably carries the text -- confirm.
void setText(const string& id);
};
// Sequence of Sentence; the type of the `train` and `heldout` arguments of Trainer::train.
typedef vector<Sentence> Sentences;
Main Classes
------------
// Reader that is given text and then yields Sentence objects one at a time.
// Instances are obtained from the static factories below or from Model::newTokenizer.
class InputFormat {
public:
virtual void resetDocument(const string& id = string());
virtual void setText(const char* text);
// Fills `s`; `error` is optional (defaults to nullptr). Presumably returns false
// once no further sentence is available or an error occurred -- confirm.
virtual bool nextSentence(Sentence& s, ProcessingError* error = nullptr);
// Factory selected by a format name string.
static InputFormat* newInputFormat(const string& name);
// Format-specific factories; each takes an optional `id` string.
static InputFormat* newConlluInputFormat(const string& id = string());
static InputFormat* newGenericTokenizerInputFormat(const string& id = string());
static InputFormat* newHorizontalInputFormat(const string& id = string());
static InputFormat* newVerticalInputFormat(const string& id = string());
// NOTE(review): `tokenizer` is declared by value here, while every factory in
// this class returns InputFormat*; the real signature presumably takes a
// pointer -- confirm against the library's API reference.
static InputFormat* newPresegmentedTokenizer(InputFormat tokenizer);
// Named string constants; presumably option values for the factories above -- confirm.
static const string CONLLU_V1;
static const string CONLLU_V2;
static const string GENERIC_TOKENIZER_NORMALIZED_SPACES;
static const string GENERIC_TOKENIZER_PRESEGMENTED;
static const string GENERIC_TOKENIZER_RANGES;
};
// Writer that turns Sentence objects into strings in a chosen output format.
// Instances are obtained from the static factories below.
class OutputFormat {
public:
// Returns the string produced for sentence `s`.
virtual string writeSentence(const Sentence& s);
// Returns a string; presumably whatever the format must emit at the end of a
// document -- confirm.
virtual string finishDocument();
// Factory selected by a format name string.
static OutputFormat* newOutputFormat(const string& name);
// Format-specific factories; each takes an optional `options` string.
static OutputFormat* newConlluOutputFormat(const string& options = string());
static OutputFormat* newEpeOutputFormat(const string& options = string());
static OutputFormat* newMatxinOutputFormat(const string& options = string());
static OutputFormat* newHorizontalOutputFormat(const string& options = string());
static OutputFormat* newPlaintextOutputFormat(const string& options = string());
static OutputFormat* newVerticalOutputFormat(const string& options = string());
// Named string constants; presumably values for the `options` arguments above -- confirm.
static const string CONLLU_V1;
static const string CONLLU_V2;
static const string HORIZONTAL_PARAGRAPHS;
static const string PLAINTEXT_NORMALIZED_SPACES;
static const string VERTICAL_PARAGRAPHS;
};
class Model {
public:
static Model* load(const char* fname);
virtual InputFormat* newTokenizer(const string& options) const;
virtual bool tag(Sentence& s, const string& options, ProcessingError* error = nullptr) const;
virtual bool parse(Sentence& s, const string& options, ProcessingError* error) const;
static const string DEFAULT;
static const string TOKENIZER_PRESEGMENTED;
};
// Runs a Model over raw input: configured with an input format, tagger options,
// parser options and an output format, then applied to a data string.
class Pipeline {
public:
// The Python example in this document calls this as
// Pipeline(model, <input format>, Pipeline.DEFAULT, Pipeline.DEFAULT, <output format>).
Pipeline(const Model* m, const string& input, const string& tagger, const string& parser, const string& output);
// Setters for the values passed to the constructor.
void setModel(const Model* m);
void setInput(const string& input);
void setTagger(const string& tagger);
void setParser(const string& parser);
void setOutput(const string& output);
void setImmediate(bool immediate);
void setDocumentId(const string& document_id);
// Processes `data` and returns the result string (the example writes it to
// stdout); `error` is optional and is what the example checks with occurred().
string process(const string& data, ProcessingError* error = nullptr) const;
// DEFAULT is what the example passes as the tagger and parser arguments;
// NONE presumably disables the corresponding step -- confirm.
static const string DEFAULT;
static const string NONE;
};
// Static entry point for training from in-memory sentences.
class Trainer {
public:
// Takes a method name, the training and held-out sentence lists, and option
// strings for the tokenizer, tagger and parser; `error` is optional.
// Returns a string; presumably the trained model data -- confirm.
static string train(const string& method, const Sentences& train, const Sentences& heldout,
const string& tokenizer, const string& tagger, const string& parser,
ProcessingError* error = nullptr);
// Named string constants; presumably values for the tokenizer/tagger/parser arguments -- confirm.
static const string DEFAULT;
static const string NONE;
};
// Evaluates a Model on a data string, configured with tokenizer, tagger and
// parser option strings.
class Evaluator {
public:
Evaluator(const Model* m, const string& tokenizer, const string& tagger, const string& parser);
// Setters for the values passed to the constructor.
void setModel(const Model* m);
void setTokenizer(const string& tokenizer);
void setTagger(const string& tagger);
void setParser(const string& parser);
// Evaluates on `data` and returns a string; `error` is optional. The format of
// the returned string is not shown here.
string evaluate(const string& data, ProcessingError* error = nullptr) const;
// Named string constants; presumably values for the tokenizer/tagger/parser arguments -- confirm.
static const string DEFAULT;
static const string NONE;
};
// Version record with three numeric components and a prerelease string.
class Version {
public:
unsigned major;
unsigned minor;
unsigned patch;
string prerelease;
// Returns current version.
// The return type is this class; the lowercase `version` is not declared
// anywhere in this listing.
static Version current();
};
下面列出被封装的 C++ API。关于原始 C++ API 的参考文档,参见 <http://ufal.mff.cuni.cz/udpipe/api-reference>。
Helper Structures ----------------- typedef vector<int> Children; typedef vector<string> Comments; class ProcessingError { public: bool occurred(); string message; }; class Token { public: string form; string misc; Token(const string& form = string(), const string& misc = string()); // CoNLL-U defined SpaceAfter=No feature bool getSpaceAfter() const; void setSpaceAfter(bool space_after); // UDPipe-specific all-spaces-preserving SpacesBefore and SpacesAfter features string getSpacesBefore() const; void setSpacesBefore(const string& spaces_before); string getSpacesAfter() const; void setSpacesAfter(const string& spaces_after); string getSpacesInToken() const; void setSpacesInToken(const string& spaces_in_token); // UDPipe-specific TokenRange feature bool getTokenRange() const; size_t getTokenRangeStart() const; size_t getTokenRangeEnd() const; void setTokenRange(size_t start, size_t end); }; class Word : public Token { public: // form and misc are inherited from token int id; // 0 is root, >0 is sentence word, <0 is undefined string lemma; // lemma string upostag; // universal part-of-speech tag string xpostag; // language-specific part-of-speech tag string feats; // list of morphological features int head; // head, 0 is root, <0 is undefined string deprel; // dependency relation to the head string deps; // secondary dependencies Children children; Word(int id = -1, const string& form = string()); }; typedef vector<Word> Words; class MultiwordToken : public Token { public: // form and misc are inherited from token int idFirst, idLast; MultiwordToken(int id_first = -1, int id_last = -1, const string& form = string(), const string& misc = string()); }; typedef vector<MultiwordToken> MultiwordTokens; class EmptyNode { public: int id; // 0 is root, >0 is sentence word, <0 is undefined int index; // index for the current id, should be numbered from 1, 0=undefined string form; // form string lemma; // lemma string upostag; // universal part-of-speech tag string xpostag; // 
language-specific part-of-speech tag string feats; // list of morphological features string deps; // secondary dependencies string misc; // miscellaneous information EmptyNode(int id = -1, int index = 0) : id(id), index(index) {} }; typedef vector<empty_node> EmptyNodes; class Sentence { public: Sentence(); Words words; MultiwordTokens multiwordTokens; EmptyNodes emptyNodes; Comments comments; static const string rootForm; // Basic sentence modifications bool empty(); void clear(); virtual Word& addWord(const char* form); void setHead(int id, int head, const string& deprel); void unlinkAllWords(); // CoNLL-U defined comments bool getNewDoc() const; string getNewDocId() const; void setNewDoc(bool new_doc, const string& id = string()); bool getNewPar() const; string getNewParId() const; void setNewPar(bool new_par, const string& id = string()); string getSentId() const; void setSentId(const string& id); string getText() const; void setText(const string& id); }; typedef vector<Sentence> Sentences; Main Classes ------------ class InputFormat { public: virtual void resetDocument(const string& id = string()); virtual void setText(const char* text); virtual bool nextSentence(Sentence& s, ProcessingError* error = nullptr); static InputFormat* newInputFormat(const string& name); static InputFormat* newConlluInputFormat(const string& id = string()); static InputFormat* newGenericTokenizerInputFormat(const string& id = string()); static InputFormat* newHorizontalInputFormat(const string& id = string()); static InputFormat* newVerticalInputFormat(const string& id = string()); static InputFormat* newPresegmentedTokenizer(InputFormat tokenizer); static const string CONLLU_V1; static const string CONLLU_V2; static const string GENERIC_TOKENIZER_NORMALIZED_SPACES; static const string GENERIC_TOKENIZER_PRESEGMENTED; static const string GENERIC_TOKENIZER_RANGES; }; class OutputFormat { public: virtual string writeSentence(const Sentence& s); virtual string finishDocument(); static 
OutputFormat* newOutputFormat(const string& name); static OutputFormat* newConlluOutputFormat(const string& options = string()); static OutputFormat* newEpeOutputFormat(const string& options = string()); static OutputFormat* newMatxinOutputFormat(const string& options = string()); static OutputFormat* newHorizontalOutputFormat(const string& options = string()); static OutputFormat* newPlaintextOutputFormat(const string& options = string()); static OutputFormat* newVerticalOutputFormat(const string& options = string()); static const string CONLLU_V1; static const string CONLLU_V2; static const string HORIZONTAL_PARAGRAPHS; static const string PLAINTEXT_NORMALIZED_SPACES; static const string VERTICAL_PARAGRAPHS; }; class Model { public: static Model* load(const char* fname); virtual InputFormat* newTokenizer(const string& options) const; virtual bool tag(Sentence& s, const string& options, ProcessingError* error = nullptr) const; virtual bool parse(Sentence& s, const string& options, ProcessingError* error) const; static const string DEFAULT; static const string TOKENIZER_PRESEGMENTED; }; class Pipeline { public: Pipeline(const Model* m, const string& input, const string& tagger, const string& parser, const string& output); void setModel(const Model* m); void setInput(const string& input); void setTagger(const string& tagger); void setParser(const string& parser); void setOutput(const string& output); void setImmediate(bool immediate); void setDocumentId(const string& document_id); string process(const string& data, ProcessingError* error = nullptr) const; static const string DEFAULT; static const string NONE; }; class Trainer { public: static string train(const string& method, const Sentences& train, const Sentences& heldout, const string& tokenizer, const string& tagger, const string& parser, ProcessingError* error = nullptr); static const string DEFAULT; static const string NONE; }; class Evaluator { public: Evaluator(const Model* m, const string& tokenizer, const 
string& tagger, const string& parser); void setModel(const Model* m); void setTokenizer(const string& tokenizer); void setTagger(const string& tagger); void setParser(const string& parser); string evaluate(const string& data, ProcessingError* error = nullptr) const; static const string DEFAULT; static const string NONE; }; class Version { public: unsigned major; unsigned minor; unsigned patch; string prerelease; // Returns current version. static version current(); };
示例
run_udpipe
一个简单的处理管道:加载数据(按需进行分词)、进行词性标注和句法分析,并以指定的输出格式写出结果:
# run_udpipe: load a model, run the pipeline over stdin, write the result to stdout.
# Usage: run_udpipe input_format output_format model_file
import sys

from ufal.udpipe import Model, Pipeline, ProcessingError # pylint: disable=no-name-in-module

# In Python2, wrap sys.stdin and sys.stdout to work with unicode.
if sys.version_info[0] < 3:
    import codecs
    import locale
    encoding = locale.getpreferredencoding()
    sys.stdin = codecs.getreader(encoding)(sys.stdin)
    sys.stdout = codecs.getwriter(encoding)(sys.stdout)

# Three positional arguments are required: input format, output format, model file.
if len(sys.argv) < 4:
    sys.stderr.write('Usage: %s input_format(tokenize|conllu|horizontal|vertical) output_format(conllu) model_file\n' % sys.argv[0])
    sys.exit(1)

sys.stderr.write('Loading model: ')
model = Model.load(sys.argv[3])
# A falsy result from Model.load is treated as a load failure.
if not model:
    sys.stderr.write("Cannot load model from file '%s'\n" % sys.argv[3])
    sys.exit(1)
sys.stderr.write('done\n')

# Default tagger and parser options; input and output formats come from the command line.
pipeline = Pipeline(model, sys.argv[1], Pipeline.DEFAULT, Pipeline.DEFAULT, sys.argv[2])
error = ProcessingError()

# Read whole input
text = ''.join(sys.stdin.readlines())

# Process data
processed = pipeline.process(text, error)
if error.occurred():
    sys.stderr.write("An error occurred when running run_udpipe: ")
    sys.stderr.write(error.message)
    sys.stderr.write("\n")
    sys.exit(1)
sys.stdout.write(processed)
版权和许可
版权所有 2016 捷克共和国布拉格查理大学数学与物理学院形式与应用语言学研究所(Institute of Formal and Applied Linguistics)。
本源代码形式受 Mozilla 公共许可证(Mozilla Public License)2.0 版条款的约束。如果本文件未随附 MPL 副本,您可以从 http://mozilla.org/MPL/2.0/ 获取。