Skip to content

Commit

Permalink
update source
Browse files Browse the repository at this point in the history
  • Loading branch information
ngocbh committed Nov 16, 2020
1 parent cca2b56 commit d4a714a
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 7 deletions.
26 changes: 20 additions & 6 deletions sctokenizer/source.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from sctokenizer.php_tokenizer import PhpTokenizer

import os
import enum

LANG_MAP = {
'cc': 'cpp',
Expand All @@ -18,14 +19,22 @@ def check_language(lang):
return LANG_MAP[lang]
return lang

@enum.unique
class SourceState(enum.Enum):
    """
    Lifecycle marker for a Source object.

    Each member has a distinct integer value; ``enum.unique`` makes an
    accidental duplicate value fail at import time.
    """
    INIT = 0          # assigned at the start of Source.__init__
    UNTOKENIZED = 1   # not referenced in the code visible here
    TOKENIZED = 2     # tokens are cached; Source.tokenize() checks this state

class Source():
def __init__(self, source_str, lang=None, name=None):
    """
    Build a Source from raw source text.

    Args:
        source_str: the source code text to wrap
        lang: language name; passed through check_language() when given,
            otherwise detected from source_str via detect_language()
        name: optional label, stored unchanged
    """
    self.__state = SourceState.INIT

    self.source_str = source_str
    if lang is None:
        self.lang = self.detect_language(self.source_str)
    else:
        self.lang = check_language(lang)
    self.name = name
    # No tokens yet; stays None until a tokenization result is stored.
    self.tokens = None

@classmethod
def from_file(cls, filepath, lang=None, name=None):
Expand All @@ -42,7 +51,6 @@ def from_file(cls, filepath, lang=None, name=None):
name = filepath
return Source(source_str, lang, name)


@classmethod
def from_str(cls, source_str, lang=None, name=None):
"""
Expand All @@ -63,24 +71,30 @@ def get_source_str(self):
return self.source_str

def tokenize(self):
    """
    Tokenize the source text, caching the result on the instance.

    The first call picks the tokenizer matching ``self.lang``, stores its
    output in ``self.tokens`` and marks the source as TOKENIZED. Later
    calls return the stored tokens without tokenizing again.

    Returns:
        whatever the selected tokenizer's ``tokenize`` returns

    Raises:
        ValueError: if ``self.lang`` is not c, cpp, java, python or php
    """
    if self.__state == SourceState.TOKENIZED:
        return self.tokens

    if self.lang == 'c':
        tokenizer = CTokenizer()
    elif self.lang == 'cpp':
        tokenizer = CppTokenizer()
    elif self.lang == 'java':
        tokenizer = JavaTokenizer()
    elif self.lang == 'python':
        tokenizer = PythonTokenizer()
    elif self.lang == 'php':
        tokenizer = PhpTokenizer()
    else:
        raise ValueError("Unsupported language")

    # Store before returning: returning the tokenizer output directly
    # would leave the cache empty and the state never updated.
    self.tokens = tokenizer.tokenize(self.source_str)
    self.__state = SourceState.TOKENIZED
    return self.tokens

@classmethod
def detect_language(cls, source_str):
"""
Expand Down
9 changes: 8 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,19 @@
# Read the README once; its text is passed to setup() as long_description.
# The encoding is explicit so the read does not depend on the platform's
# default locale encoding.
with open("README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# Extra links passed to setup() as project_urls.
PROJECT_URLS = {
    'Bug Tracker': 'https://github.com/ngocjr7/sctokenizer/issues',
    'Documentation': 'https://github.com/ngocjr7/sctokenizer/blob/master/README.md',
    'Source Code': 'https://github.com/ngocjr7/sctokenizer'
}

# `version` must appear exactly once: a repeated keyword argument is a
# SyntaxError, so only the new release number is kept.
setup(name='sctokenizer',
      description='A Source Code Tokenizer',
      author='Ngoc Bui',
      long_description=long_description,
      long_description_content_type="text/markdown",
      project_urls=PROJECT_URLS,
      author_email='[email protected]',
      version='0.0.5',
      packages=find_packages(),
      python_requires='>=3.6')

0 comments on commit d4a714a

Please sign in to comment.