Skip to content

Latest commit

 

History

History
40 lines (29 loc) · 1.59 KB

README.md

File metadata and controls

40 lines (29 loc) · 1.59 KB

pyokt

An open-korean-text wrapper for Python.

Install

pip install pyokt

For Windows users: please install the appropriate JPype1 wheel from http://www.lfd.uci.edu/~gohlke/pythonlibs/

Example

import pyokt

text = "나는 집에 가서 잠을 자고 싶어. 너도 그러닠ㅋㅋㅋㅋㅋㅋㅋ"

# Normalization
pyokt.normalize(text)
>>> 나는 집에 가서 잠을 자고 싶어. 너도 그러니ㅋㅋㅋ

# Tokenization
pyokt.tokenize(text, nomalize=True, stem=True)
>>> [나(Noun: 0, 1), 는(Josa: 1, 1),  (Space: 2, 1), 집(Noun: 3, 1), 에(Josa: 4, 1),  (Space: 5, 1), 가다(Verb: 6, 2),  (Space: 8, 1), 잠(Noun: 9, 1), 을(Josa: 10, 1),  (Space: 11, 1), 자고(Noun: 12, 2),  (Space: 14, 1), 싶다(Verb: 15, 2), .(Punctuation: 17, 1),  (Space: 18, 1), 너(Noun: 19, 1), 도(Josa: 20, 1),  (Space: 21, 1), 그렇다(Adjective: 22, 3), ㅋㅋㅋ(KoreanParticle: 25, 3)]

# Split sentences
pyokt.split_sentences(text)
>>> ['나는 집에 가서 잠을 자고 싶어.', '너도 그러닠ㅋㅋㅋㅋㅋㅋㅋ']

# Add nouns to the dictionary
pyokt.tokenize("군단숙주가 나타났다")
>>> [군단(Noun: 0, 2), 숙주(Noun: 2, 2), 가(Josa: 4, 1),  (Space: 5, 1), 나타났다(Verb: 6, 4)]
pyokt.add_nouns_to_dictionary(['군단숙주'])
pyokt.tokenize("군단숙주가 나타났다")
>>> [군단숙주(Noun: 0, 4), 가(Josa: 4, 1),  (Space: 5, 1), 나타났다(Verb: 6, 4)]

# Phrase extraction
pyokt.extract_phrases("깊은 잠에 빠지고 말았다.")
>>> [깊은 잠(Noun: 0, 4)]