From d4a714a25499c68ad437d3825f2d464800322bf4 Mon Sep 17 00:00:00 2001
From: Ngoc Bui
Date: Mon, 16 Nov 2020 20:07:15 +0700
Subject: [PATCH] update source

---
 sctokenizer/source.py | 26 ++++++++++++++++++++------
 setup.py              |  9 ++++++++-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/sctokenizer/source.py b/sctokenizer/source.py
index ff78ce6..806d609 100644
--- a/sctokenizer/source.py
+++ b/sctokenizer/source.py
@@ -7,6 +7,7 @@ from sctokenizer.php_tokenizer import PhpTokenizer
 
 
 import os
+import enum
 
 LANG_MAP = {
     'cc': 'cpp',
@@ -18,14 +19,22 @@ def check_language(lang):
         return LANG_MAP[lang]
     return lang
 
+class SourceState(enum.Enum):
+    INIT = 0
+    UNTOKENIZED = 1
+    TOKENIZED = 2
+
 class Source():
     def __init__(self, source_str, lang=None, name=None):
+        self.__state = SourceState.INIT
+
         self.source_str = source_str
         if lang is None:
             self.lang = self.detect_language(self.source_str)
         else:
             self.lang = check_language(lang)
         self.name = name
+        self.tokens = None
 
     @classmethod
     def from_file(cls, filepath, lang=None, name=None):
@@ -42,7 +51,6 @@ def from_file(cls, filepath, lang=None, name=None):
             name = filepath
         return Source(source_str, lang, name)
 
-
     @classmethod
     def from_str(cls, source_str, lang=None, name=None):
         """
@@ -63,24 +71,30 @@ def get_source_str(self):
         return self.source_str
 
     def tokenize(self):
+        if self.__state == SourceState.TOKENIZED:
+            return self.tokens
+
         if self.lang == 'c':
             c_tokenizer = CTokenizer()
-            return c_tokenizer.tokenize(self.source_str)
+            self.tokens = c_tokenizer.tokenize(self.source_str)
         elif self.lang == 'cpp':
             cpp_tokenizer = CppTokenizer()
-            return cpp_tokenizer.tokenize(self.source_str)
+            self.tokens = cpp_tokenizer.tokenize(self.source_str)
         elif self.lang == 'java':
             java_tokenizer = JavaTokenizer()
-            return java_tokenizer.tokenize(self.source_str)
+            self.tokens = java_tokenizer.tokenize(self.source_str)
         elif self.lang == 'python':
             python_tokenizer = PythonTokenizer()
-            return python_tokenizer.tokenize(self.source_str)
+            self.tokens = python_tokenizer.tokenize(self.source_str)
         elif self.lang == 'php':
             php_tokenizer = PhpTokenizer()
-            return php_tokenizer.tokenize(self.source_str)
+            self.tokens = php_tokenizer.tokenize(self.source_str)
         else:
             raise ValueError("Upsupported language")
 
+        self.__state = SourceState.TOKENIZED
+        return self.tokens
+
     @classmethod
     def detect_language(cls, source_str):
         """
diff --git a/setup.py b/setup.py
index d7ea7f4..aa195f6 100644
--- a/setup.py
+++ b/setup.py
@@ -3,12 +3,19 @@
 with open("README.md", "r") as fh:
     long_description = fh.read()
 
+PROJECT_URLS = {
+    'Bug Tracker': 'https://github.com/ngocjr7/sctokenizer/issues',
+    'Documentation': 'https://github.com/ngocjr7/sctokenizer/blob/master/README.md',
+    'Source Code': 'https://github.com/ngocjr7/sctokenizer'
+}
+
 setup(name='sctokenizer',
       description='A Source Code Tokenizer',
       author='Ngoc Bui',
       long_description=long_description,
       long_description_content_type="text/markdown",
+      project_urls=PROJECT_URLS,
       author_email='ngocjr7@gmail.com',
-      version='0.0.2',
+      version='0.0.5',
      packages=find_packages(),
      python_requires='>=3.6')