From 4abf636db5b886baf6097858854e9d88878ef03f Mon Sep 17 00:00:00 2001
From: jeremiahtinker
Date: Sat, 4 Mar 2017 15:16:49 -0600
Subject: [PATCH] Adding main.py and name_entity.py to a new branch for
 sentiment analysis. Parsing "libraries" that return time/location
 information with LDA-assumed sentiment topics

---
 main.py        | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++
 name_entity.py | 26 ++++++++++++++++++
 2 files changed, 98 insertions(+)
 create mode 100644 main.py
 create mode 100644 name_entity.py

diff --git a/main.py b/main.py
new file mode 100644
index 0000000..a97c67e
--- /dev/null
+++ b/main.py
@@ -0,0 +1,72 @@
+import glob2
+import nltk
+import string
+import re
+import gensim
+import dateparser
+from collections import Counter
+from gensim import corpora
+from nltk.corpus import stopwords
+from nltk.stem.wordnet import WordNetLemmatizer
+from name_entity import get_name_entities
+
+stop = set(stopwords.words('english'))
+exclude = set(string.punctuation)
+lemma = WordNetLemmatizer()
+
+
+def clean(doc):
+    # Lowercase, drop stopwords and punctuation, then lemmatize each token.
+    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
+    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
+    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
+    return normalized
+
+
+def main():
+    for folder_name in glob2.glob("DentonDD2/20*/"):
+        folder_items = glob2.glob("%s*.txt" % folder_name)
+        print folder_name, ":", len(folder_items)
+        doc_complete = []
+        for filename in folder_items:
+            if not re.findall("(stopword|xfiles)", filename):
+                doc_complete.append(open(filename).read())
+        # Keep only tokens longer than two characters after cleaning.
+        doc_clean = [filter(lambda x: len(x) > 2, clean(doc.decode("ascii", "ignore")).split())
+                     for doc in doc_complete]
+        dictionary = corpora.Dictionary(doc_clean)
+        doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
+        Lda = gensim.models.ldamodel.LdaModel
+        ldamodel = Lda(doc_term_matrix, num_topics=4,
+                       id2word=dictionary, passes=1)
+        topics = ldamodel.print_topics()
+        for index, doc in enumerate(doc_clean):
+            # Assign the document to the topic whose terms it mentions most often.
+            topic_max = None
+            topic_value = 0
+            col = Counter(doc)
+            for topic in topics:
+                sum_of_terms = 0
+                for word in re.findall(r"\"(\w+)\"", topic[1]):
+                    if word in col:
+                        sum_of_terms += col[word]
+                if sum_of_terms >= topic_value:
+                    topic_value = sum_of_terms
+                    topic_max = topic
+            print topic_max
+            sentences = nltk.sent_tokenize(doc_complete[index].decode("ascii", "ignore"))
+            # Keep only sentences that mention at least one of the winning topic's terms.
+            regex = "|".join(re.findall(r"\"(\w+)\"", topic_max[1]))
+            important_sentences = [sentence for sentence in sentences if re.findall(regex, sentence)]
+            print len(important_sentences)
+            name_entities = set()
+            date_time = set()
+            for sentence in important_sentences:
+                name_entities.update(get_name_entities(sentence))
+                parsed_date = dateparser.parse(sentence)
+                if parsed_date:
+                    date_time.add(parsed_date)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/name_entity.py b/name_entity.py
new file mode 100644
index 0000000..c30cfda
--- /dev/null
+++ b/name_entity.py
@@ -0,0 +1,26 @@
+import nltk
+
+
+def extract_entity_names(t):
+    # Recursively collect the leaves of every 'NE' subtree in a parse tree.
+    entity_names = []
+    if hasattr(t, 'label') and t.label():
+        if t.label() == 'NE':
+            entity_names.append(' '.join([child[0] for child in t]))
+        else:
+            for child in t:
+                entity_names.extend(extract_entity_names(child))
+
+    return entity_names
+
+
+def get_name_entities(sample):
+    # Tokenize, POS-tag, and NE-chunk the sample, then collect the entity names.
+    sentences = nltk.sent_tokenize(sample)
+    tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
+    tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
+    chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)
+    entity_names = []
+    for tree in chunked_sentences:
+        entity_names.extend(extract_entity_names(tree))
+    return set(entity_names)
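
For reference, a minimal usage sketch of get_name_entities and dateparser as they are wired up in main.py. It assumes the NLTK data packages "punkt", "averaged_perceptron_tagger", "maxent_ne_chunker", and "words" have already been downloaded, and the sample sentence below is invented purely for illustration.

    # Hypothetical driver snippet; run under Python 2 like the patch itself.
    import dateparser
    from name_entity import get_name_entities

    sentence = "Barack Obama visited Denton on March 4, 2017."
    print get_name_entities(sentence)    # e.g. set(['Barack Obama', 'Denton'])
    parsed = dateparser.parse(sentence)  # may be None if the text is not mostly a date
    if parsed:
        print parsed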