IOB2 Corpus Training data generator
A simple generator using the IOB2 Tagged Corpus, IREX Standard
https://github.com/Hironsan/IOB2Corpus
Download file here:
https://gist.github.com/jpena930/0753edfd27e010503755ccfdaeb965bf
Usage:
$ python training_generator <text file>
https://github.com/Hironsan/IOB2Corpus
Download file here:
https://gist.github.com/jpena930/0753edfd27e010503755ccfdaeb965bf
Usage:
$ python training_generator <text file>
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding: utf-8
from __future__ import print_function  # Only needed for Python 2

import sys
import os

import MeCab
import CaboCha

# Parser used to analyse the input text. The option string is handed to
# CaboCha unchanged; presumably "-f1" selects the line-per-token lattice
# format and "-n1" enables named-entity tags -- confirm against the CaboCha
# documentation.
cabocha = CaboCha.Parser("-f1 -n1")

# MeCab tagger configured for ChaSen-style output. It is not referenced by any
# other code visible in this file; kept so the module's names are unchanged.
m = MeCab.Tagger("-Ochasen")
# For reading from file | |
class getWords():
    """Small helper that loads the raw text handed to the parser."""

    def readText(self, filename):
        """Return the whole content of ``filename`` decoded as UTF-8.

        Args:
            filename: Path of the text file to read.

        Returns:
            The file content as a single string.

        Raises:
            OSError: If the file cannot be opened.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        # Open the file; the ``with`` statement closes it on exit, so no
        # explicit close() call is needed.
        with open(filename, 'r', encoding='utf-8') as f:
            return f.read()
# Usage: python training_generator <text file>

# Long entity names found in the parser output and the three-letter forms they
# are rewritten to. Each pair is applied to both the "B-" and the "I-" tag.
ENTITY_TAG_ABBREVIATIONS = (
    ("ORGANIZATION", "ORG"),
    ("ARTIFACT", "ART"),
    ("LOCATION", "LOC"),
    ("DATE", "DAT"),
    ("TIME", "TIM"),
    ("PERSON", "PSN"),
    ("MONEY", "MNY"),
    ("PERCENT", "PNT"),
)


def shorten_entity_tags(parsed):
    """Replace the long B-/I- entity tags in ``parsed`` with their short forms.

    Args:
        parsed: Parser output as one string.

    Returns:
        The same text with e.g. "B-ORGANIZATION" rewritten to "B-ORG".
    """
    for long_name, short_name in ENTITY_TAG_ABBREVIATIONS:
        for prefix in ("B-", "I-"):
            parsed = parsed.replace(prefix + long_name, prefix + short_name)
    return parsed


def build_corpus_lines(parsed):
    """Turn normalised parser output into the rows of the generated file.

    Rows starting with '*' are skipped, and an empty row is inserted after
    every row that starts with '。'. The last two rows of the result are then
    dropped, exactly as the original script did -- presumably the trailing
    blank row and the parser's end-of-sentence marker; confirm against real
    CaboCha output.

    Args:
        parsed: Parser output as one string.

    Returns:
        A list of rows without line terminators.
    """
    rows = []
    for row in parsed.splitlines():
        if row.startswith('*'):
            continue
        rows.append(row)
        if row.startswith('。'):
            rows.append("")
    return rows[:-2]


def main(argv=None):
    """Generate ``<text file>_generated.txt`` from the file named in ``argv``.

    Args:
        argv: Argument list in ``sys.argv`` layout; defaults to ``sys.argv``.

    Raises:
        SystemExit: If no input file was given.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 2:
        sys.exit("Usage: python training_generator <text file>")

    source_path = args[1]
    # Read the input once, as UTF-8, through the file's own reader class.
    text = getWords().readText(source_path)

    parsed = shorten_entity_tags(cabocha.parseToString(text))
    # Remove commas and replace with tab
    parsed = parsed.replace(",", "\t")

    output_path = source_path + '_generated.txt'
    # Mode 'w' truncates a previous result, so no explicit delete is needed.
    # UTF-8 is set explicitly so the Japanese text does not depend on the
    # platform's default encoding.
    with open(output_path, 'w', encoding='utf-8') as out:
        out.writelines(row + "\n" for row in build_corpus_lines(parsed))


if __name__ == "__main__":
    main()
コメント
コメントを投稿