diff --git a/README.md b/README.md
index 6246d10..2a8788a 100644
--- a/README.md
+++ b/README.md
@@ -52,5 +52,10 @@ Programming projects that I have written at university
 - [Project 1](/cmpe362/project1) `Matlab`
 - [Project 2](/cmpe362/project2) `Matlab`
 - [Project 3](/cmpe362/project3) `Matlab`
+
+### [CmpE493 - Introduction to Information Retrieval](/cmpe493)
+- [Project 1](/cmpe493/project1) `Python`
+- [Project 2](/cmpe493/project2) `Python`
+- [Project 3](/cmpe493/project3) `Python`
 
 ### [CmpE352 & CmpE451 - Project Development in Software Engineering](https://github.com/bounswe/bounswe2017group11)
diff --git a/cmpe493/README.md b/cmpe493/README.md
new file mode 100644
index 0000000..9de438b
--- /dev/null
+++ b/cmpe493/README.md
@@ -0,0 +1,4 @@
+# CmpE493 - Introduction to Information Retrieval
+| Term | Instructor |
+| --- | --- |
+| Spring 2018 | Arzucan Özgür |
\ No newline at end of file
diff --git a/cmpe493/project1/README.md b/cmpe493/project1/README.md
new file mode 100644
index 0000000..84d9acc
--- /dev/null
+++ b/cmpe493/project1/README.md
@@ -0,0 +1,13 @@
+The entry point of my application is app.py
+
+You should use Python 3.
+
+Run the app with the "python3 app.py" command.
+
+The first time you run it, it builds the inverted index.
+
+The default data directory is "reuters21578".
+
+The stop word list must be in the current directory and named "stopwords.txt".
+
+Use the help command for further information.
\ No newline at end of file
diff --git a/cmpe493/project1/Report.pdf b/cmpe493/project1/Report.pdf
new file mode 100644
index 0000000..3b0417d
Binary files /dev/null and b/cmpe493/project1/Report.pdf differ
diff --git a/cmpe493/project1/app.py b/cmpe493/project1/app.py
new file mode 100644
index 0000000..ef28704
--- /dev/null
+++ b/cmpe493/project1/app.py
@@ -0,0 +1,115 @@
+import os
+import re
+from indexer import Indexer
+from query import Query
+from query import QueryType
+
+def make_bold(string):
+    """
+    Makes the given string bold in the terminal using ANSI escape codes
+    """
+    return "\033[1m{}\033[0m".format(string)
+
+def command_help():
+    """
+    Prints the help page of the application to the terminal
+    """
+    print("\n\n\t\t\t\t\t" + make_bold("--- REUTERS SEARCH ENGINE ---"))
+    print(make_bold("COMMANDS"))
+    print("\t" + make_bold("exit") + "\t\t Exits from the program")
+    print("\t" + make_bold("help") + "\t\t Lists available commands")
+    print("\t" + make_bold("index [FOLDER]") + "\t Builds the positional inverted index from the given folder. 
Default: reuters21578") + print("\t" + make_bold("clear") + "\t\t Clear console screen") + print("\t" + make_bold("remove") + "\t\t Removes inverted index files") + print("\t" + make_bold("postings WORD") + "\t Returns postings of word") + print("\n\t" + "** There is no special command for query processing.") + print("\t" + "Inputs that aren't special command interpreted as query") + print("\n\t" + make_bold("[QUERY_TYPE] YOUR_QUERY") + "\t Processes query based on given type.") + print("\t\t\t\t If no type is given, it predicts query type") + print("\t" + make_bold("Query Types")) + print("\t\t" + make_bold(QueryType.CONJUNCTIVE) + " -> Conjunctive Query") + print("\t\t" + make_bold(QueryType.PHRASE) + " -> Phrase Query") + print("\t\t" + make_bold(QueryType.PROXIMITY) + " -> Proximity Query") + print("\n\n\n") + +def command_index(directory): + """ + Indexes data that from given directory again + """ + global dictionary + global index + Indexer.remove_index() + # Set default data directory + if directory is None: + directory = 'reuters21578' + print('Indexing ' + directory + ' folder...') + Indexer.create_index(directory=directory) + dictionary, index = Indexer.get_index() + print('Index created') + +def command_remove(): + """ + Removes current index files + """ + Indexer.remove_index() + global dictionary + global index + dictionary = {} + index = {} + print('Index removed') + +def command_postings(word, dictionary, index): + """ + Returns postings of given word + """ + postings = Indexer.get_postings(word, dictionary, index) + print(postings) + +def command_exit(): + """ + Exits from application + """ + print("Goodbye...") + exit(1) + +def command_clear(): + """ + Clears terminal screen + """ + os.system("clear") + +#################################### +########## APP START HERE ########## +#################################### + +# If the index isn't created create it +if not Indexer.is_indexed(): + command_index(None) +else: + print('Data is already indexed') + +dictionary, index = Indexer.get_index() + +print("Type " + make_bold("help") + " for any documentation") +while True: + # Get command from user and processes it + command = input("query> ") + postings_command = re.match(r'^postings\s(\w+)', command) + index_command = re.match(r'^index\s?(\w+)?', command) + if command == "exit": + command_exit() + elif index_command: + command_index(index_command.group(1)) + elif command == "help": + command_help() + elif command == "clear": + command_clear() + elif command == "remove": + command_remove() + elif postings_command: + command_postings(postings_command.group(1), dictionary, index) + else: + query = Query(command) + result = query.run(dictionary, index) + print(make_bold(str(len(result)) + ' documents are founded')) + print(sorted(result)) diff --git a/cmpe493/project1/cmpe493-assignment1-specification.pdf b/cmpe493/project1/cmpe493-assignment1-specification.pdf new file mode 100644 index 0000000..97f42e0 Binary files /dev/null and b/cmpe493/project1/cmpe493-assignment1-specification.pdf differ diff --git a/cmpe493/project1/indexer.py b/cmpe493/project1/indexer.py new file mode 100644 index 0000000..b546e02 --- /dev/null +++ b/cmpe493/project1/indexer.py @@ -0,0 +1,142 @@ +import os, re, pickle +from tokenizer import Tokenizer + +class Indexer: + """ + Handles inverted index operations + """ + DICTIONARY_NAME = 'dictionary.txt' # Name of the dictionary file + INDEX_NAME = 'inverted_index.txt' # Name of the inverted index file + POSTING_ID = 1 # Starting ID for posting lists + + 
@classmethod + def read_files(self, directory=None): + """ + Returns read documents from data directory + """ + # If no directory is given, set it to current directory + directory = os.getcwd() if directory is None else directory + filenames = os.listdir(directory) + # Get all file with .sgm extension + filenames = [filename for filename in filenames if filename.endswith(".sgm")] + filenames.sort() + documents = [] + # Extract documents from each file + for filename in filenames: + raw_data = open(os.path.join(directory, filename), "r", encoding="latin-1").read() + documents += self.extract_documents(raw_data) + return documents + + @classmethod + def extract_documents(self, raw_data): + """ + Extracts documents from raw string + """ + # Some news don't have body or title + # return re.findall(r'\d+)\">.*?(?P<title>.*?).*?(?P.*?).*?', raw_data, re.DOTALL) + documents = [] + # Seperate each document + raw_documents = raw_data.split('') + # Extract information from each raw document string + for raw_document in raw_documents: + doc_id = re.match(r'.+?NEWID=\"(?P\d+)\">.+?', raw_document, re.DOTALL) + doc_title = re.match(r'.+?(?P<title>.+?).+?', raw_document, re.DOTALL) + doc_body = re.match(r'.+?(?P.+?).+?', raw_document, re.DOTALL) + + # If raw corpus has ID, it's a document, add it to list + if doc_id: + doc_id = int(doc_id.group('id')) + # If it's not have title or body, put empty string instead of them + doc_title = doc_title.group('title') if doc_title else '' + doc_body = doc_body.group('body') if doc_body else '' + documents.append({'id': doc_id, 'title': doc_title, 'body':doc_body}) + return documents + + @classmethod + def create_index(self, directory=None): + """ + Creates index from data that in given directory + """ + # Read files and get documents + documents = self.read_files(directory) + # Initialize directory and inverted index + dictionary = {} + inverted_index = {} + # Load stop words from file + stop_words = Tokenizer.stop_words() + + for document in documents: + doc_id = document['id'] + # Concatenate title and body, then tokenize this combination + tokens = Tokenizer.tokenize(document['title'] + ' ' + document['body']) + # Iterate all tokens and if it's not a stop word, add it to index with it's position + for position, token in enumerate(tokens): + if not token in stop_words: + # Get ID of positional indexes of the token + postings_id = dictionary.get(token, self.get_posting_id()) + # Get positional indexes of token as dictionary + postings = inverted_index.get(postings_id, {}) + # Get positions of the token in the document as list + positions = postings.get(doc_id, []) + # Add this position to positional index + positions.append(position) + # Put positions list of the this document back to token's document's list + postings[doc_id] = positions + # Put updated positional indexes of the token back to inverted index + inverted_index[postings_id] = postings + # Update ID of the token in dictionary + dictionary[token] = postings_id + # Save created index to file + self.save_index(dictionary, inverted_index) + + @classmethod + def get_posting_id(self): + """ + Returns globally incremented ID for next postings list + """ + self.POSTING_ID += 1 + return self.POSTING_ID - 1 + + @classmethod + def get_postings(self, token, dictionary, index): + """ + Returns documents and positions of given token after normalization + """ + stem = Tokenizer.normalize_and_stem(token) + posting_id = dictionary.get(stem) + return index.get(posting_id, {}) + + @classmethod + def save_index(self, 
directory, index): + """ + Save dictionary and inverted index to file + """ + pickle.dump(directory, open(self.DICTIONARY_NAME, 'wb')) + pickle.dump(index, open(self.INDEX_NAME, 'wb')) + + @classmethod + def get_index(self): + """ + Load dictionary and inverted index from file + Returns: + dictionary, index + """ + return pickle.load(open(self.DICTIONARY_NAME, 'rb')), pickle.load(open(self.INDEX_NAME, 'rb')) + + @classmethod + def remove_index(self): + """ + Removes old inverted index files + """ + try: + os.remove(self.DICTIONARY_NAME) + os.remove(self.INDEX_NAME) + except OSError: + pass + + @classmethod + def is_indexed(self): + """ + Checks if index is exist + """ + return os.path.isfile(self.DICTIONARY_NAME) and os.path.isfile(self.INDEX_NAME) \ No newline at end of file diff --git a/cmpe493/project1/query.py b/cmpe493/project1/query.py new file mode 100644 index 0000000..63f5412 --- /dev/null +++ b/cmpe493/project1/query.py @@ -0,0 +1,216 @@ +import re +from enum import Enum +from tokenizer import Tokenizer + +class Query: + """ + Handles query operations + """ + + def __init__(self, query): + """ + Constructs an new query with given one + """ + self.type, self.query = Query.extract(query) + + @staticmethod + def extract(query): + """ + Returns query's text and it's type + If no type is given, it guesses type of the query + Returns: + type, query + """ + type = re.match(r'^\d+', query) + if type: + # Get type of the query from text and return with query + return int(type.group()), re.sub(r'^\d+\s', '', query) + else: + # Guess type of the query + # If it has AND it's conjunctive + # If it has /NUMBER it's proximity + # Otherwise, it's phrase + if 'AND' in query: + return QueryType.CONJUNCTIVE, query + elif re.match(r'.*?/\d+.*?', query): + return QueryType.PROXIMITY, query + else: + return QueryType.PHRASE, query + + def run(self, dictionary, index): + """ + Runs query depends on it's type + Phrase queries are same with proximity queries. 
+ Just replace spaces with /0 + """ + result = [] + if self.type == QueryType.CONJUNCTIVE: + result = self.run_conjunctive(dictionary, index) + elif self.type == QueryType.PHRASE: + self.query = self.query.replace(' ', ' /0 ') + result = self.run_proximity(dictionary, index) + elif self.type == QueryType.PROXIMITY: + result = self.run_proximity(dictionary, index) + else: + print("Unknown query type") + return result + + def run_conjunctive(self, dictionary, index): + """ + Runs conjunctive query + """ + # Tokenize and normalize query + tokens = self.query.split(' AND ') + stems = [Tokenizer.normalize_and_stem(token) for token in tokens] + # Get inverted indexies of all stems + postings_list = [] + for stem in stems: + posting_id = dictionary.get(stem) + postings = index.get(posting_id, {}) + postings_list.append(postings) + # Intersect given lists + return self.intersect_list(postings_list) + + def run_proximity(self, dictionary, index): + """ + Runs proximity query + """ + result = [] + # Tokenize and normalize query + tokens = re.split(r'\s\/\d+\s', self.query) + stems = [Tokenizer.normalize_and_stem(token) for token in tokens] + # Get proximities + proximities = [int(proximity) for proximity in re.findall(r'\d+', self.query)] + # Get inverted indexies of all stems + postings_list = [] + for stem in stems: + posting_id = dictionary.get(stem) + postings = index.get(posting_id, {}) + postings_list.append(postings) + # Intersect positionally given lists + return self.positional_intersect_list(postings_list, proximities) + + def intersect(self, ps1, ps2): + """ + Intersects two given lists + Algorithm is based on Figure 1.6 from book + """ + answer = [] + p1 = next(ps1) + p2 = next(ps2) + while True: + try: + if p1 == p2: + # If document's IDs are same add to answers + answer.append(p1) + p1 = next(ps1) + p2 = next(ps2) + elif p1 < p2: + p1 = next(ps1) + else: + p2 = next(ps2) + except StopIteration: + break + return answer + + def intersect_list(self, postings_list): + """ + Intersects multiple lists + Algorithm is based on Figure 1.7 from book + """ + # Sort them by their frequencies + postings_list.sort(key=lambda postings: len(postings)) + + # Intersect lists 2 by 2 + result = postings_list.pop(0) + while result and postings_list: + try: + postings = postings_list.pop(0) + result = self.intersect(iter(result), iter(postings)) + except IndexError: + break + if isinstance(result, dict): + result = list(result.keys()) + return result + + def positional_intersect_list(self, postings_list, proximities): + """ + Intersects positionally multiple lists + Intersect next list with previous answer + """ + start_postings = postings_list.pop(0) + result = start_postings.keys() + while postings_list and proximities: + try: + next_postings = postings_list.pop(0) + k = proximities.pop(0) + result, start_postings = self.positional_intersect(start_postings, next_postings, k) + if not result: + break + except IndexError: + break + return result + + def positional_intersect(self, ps1, ps2, k): + """ + Intersects two given lists by proximity + Algorithm is based on Figure 2.12 from book + When you said next to last element, + Python throws and exception instead of just return None + So my method has too many try:except blocks + """ + + k = k + 1 + answer = set() + postings = {} + ps1_iter = iter(ps1) + ps2_iter = iter(ps2) + try: + p1 = next(ps1_iter) + p2 = next(ps2_iter) + except StopIteration: + return answer, postings + while True: + try: + if p1 == p2: + l = [] + p1_positions = iter(ps1[p1]) + 
p2_positions = iter(ps2[p2]) + pp1 = next(p1_positions) + pp2 = next(p2_positions) + while True: + try: + while True: + try: + if pp2 - pp1 <= k and pp2 - pp1 > 0: + l.append(pp2) + elif pp2 > pp1: + break + pp2 = next(p2_positions) + except StopIteration: + break + for ps in l: + answer.add(p1) + positions = postings.get(p1, []) + positions.append(ps) + postings[p1] = positions + pp1 = next(p1_positions) + except StopIteration: + break + p1 = next(ps1_iter) + p2 = next(ps2_iter) + elif p1 < p2: + p1 = next(ps1_iter) + else: + p2 = next(ps2_iter) + except StopIteration: + break + return answer, postings + +class QueryType: + """ + Enumerates type of the query + """ + CONJUNCTIVE = 1 + PHRASE = 2 + PROXIMITY = 3 diff --git a/cmpe493/project1/report.py b/cmpe493/project1/report.py new file mode 100644 index 0000000..a6849d7 --- /dev/null +++ b/cmpe493/project1/report.py @@ -0,0 +1,79 @@ +import os +import re +from indexer import Indexer +from tokenizer import Tokenizer +import operator + +documents = Indexer.read_files("original_data") + +countA = 0 +countB = 0 + +setC = set() +setD = set() + +counterE = {} +counterF = {} + +for document in documents: + countA += len(document['title'].split()) + countA += len(document['body'].split()) + + countB += len(Tokenizer.remove_stop_words(document['title']).split()) + countB += len(Tokenizer.remove_stop_words(document['body']).split()) + + setC |= set(document['title'].split()) + setC |= set(document['body'].split()) + + setD |= set([ + Tokenizer.stem(token) for token in Tokenizer.remove_stop_words( Tokenizer.make_lower_case(document['title']) ).split() + ]) + + setD |= set([ + Tokenizer.stem(token) for token in Tokenizer.remove_stop_words(Tokenizer.make_lower_case(document['body'])).split() + ]) + + for term in document['title'].split(): + count = counterE.get(term, 0) + count += 1 + counterE[term] = count + + for term in document['body'].split(): + count = counterE.get(term, 0) + count += 1 + counterE[term] = count + + for term in [Tokenizer.stem(token) for token in Tokenizer.remove_stop_words(Tokenizer.make_lower_case(document['title'])).split()]: + count = counterF.get(term, 0) + count += 1 + counterF[term] = count + + for term in [Tokenizer.stem(token) for token in Tokenizer.remove_stop_words(Tokenizer.make_lower_case(document['body'])).split()]: + count = counterF.get(term, 0) + count += 1 + counterF[term] = count + +''' + PRINT RESULTS +''' +print("\n(a) How many tokens does the corpus contain before stopword removal and stemming?") +print(countA) + +print("\n(b) How many tokens does the corpus contain after stopword removal and stemming?") +print(countB) + +print("\n(c) How many terms(unique tokens) are there before stopword removal, stemming, and case - folding?") +print(len(setC)) + +print("\n(d) How many terms(unique tokens) are there after stopword removal, stemming, and casefolding?") +print(len(setD)) + +print("\n(e) List the top 20 most frequent terms before stopword removal, stemming, and casefolding?") +tops = list(sorted(counterE.items(), key=operator.itemgetter(1), reverse=True)) +for i in range(0, 20): + print(tops[i]) + +print("\n(f) List the top 20 most frequent terms after stopword removal, stemming, and case - folding?") +tops = list(sorted(counterF.items(), key=operator.itemgetter(1), reverse=True)) +for i in range(0, 20): + print(tops[i]) diff --git a/cmpe493/project1/stemmer.py b/cmpe493/project1/stemmer.py new file mode 100644 index 0000000..aed41e4 --- /dev/null +++ b/cmpe493/project1/stemmer.py @@ -0,0 +1,407 @@ 
+#!/usr/bin/env python + +"""Porter Stemming Algorithm +This is the Porter stemming algorithm, ported to Python from the +version coded up in ANSI C by the author. It may be be regarded +as canonical, in that it follows the algorithm presented in + +Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, +no. 3, pp 130-137, + +only differing from it at the points maked --DEPARTURE-- below. + +See also http://www.tartarus.org/~martin/PorterStemmer + +The algorithm as described in the paper could be exactly replicated +by adjusting the points of DEPARTURE, but this is barely necessary, +because (a) the points of DEPARTURE are definitely improvements, and +(b) no encoding of the Porter stemmer I have seen is anything like +as exact as this version, even with the points of DEPARTURE! + +Vivake Gupta (v@nano.com) + +Release 1: January 2001 + +Further adjustments by Santiago Bruno (bananabruno@gmail.com) +to allow word input not restricted to one word per line, leading +to: + +release 2: July 2008 +""" + +import sys + + +class PorterStemmer: + + def __init__(self): + """The main part of the stemming algorithm starts here. + b is a buffer holding a word to be stemmed. The letters are in b[k0], + b[k0+1] ... ending at b[k]. In fact k0 = 0 in this demo program. k is + readjusted downwards as the stemming progresses. Zero termination is + not in fact used in the algorithm. + + Note that only lower case sequences are stemmed. Forcing to lower case + should be done before stem(...) is called. + """ + + self.b = "" # buffer for word to be stemmed + self.k = 0 + self.k0 = 0 + self.j = 0 # j is a general offset into the string + + def cons(self, i): + """cons(i) is TRUE <=> b[i] is a consonant.""" + if self.b[i] == 'a' or self.b[i] == 'e' or self.b[i] == 'i' or self.b[i] == 'o' or self.b[i] == 'u': + return 0 + if self.b[i] == 'y': + if i == self.k0: + return 1 + else: + return (not self.cons(i - 1)) + return 1 + + def m(self): + """m() measures the number of consonant sequences between k0 and j. + if c is a consonant sequence and v a vowel sequence, and <..> + indicates arbitrary presence, + + gives 0 + vc gives 1 + vcvc gives 2 + vcvcvc gives 3 + .... + """ + n = 0 + i = self.k0 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + while 1: + while 1: + if i > self.j: + return n + if self.cons(i): + break + i = i + 1 + i = i + 1 + n = n + 1 + while 1: + if i > self.j: + return n + if not self.cons(i): + break + i = i + 1 + i = i + 1 + + def vowelinstem(self): + """vowelinstem() is TRUE <=> k0,...j contains a vowel""" + for i in range(self.k0, self.j + 1): + if not self.cons(i): + return 1 + return 0 + + def doublec(self, j): + """doublec(j) is TRUE <=> j,(j-1) contain a double consonant.""" + if j < (self.k0 + 1): + return 0 + if (self.b[j] != self.b[j - 1]): + return 0 + return self.cons(j) + + def cvc(self, i): + """cvc(i) is TRUE <=> i-2,i-1,i has the form consonant - vowel - consonant + and also if the second c is not w,x or y. this is used when trying to + restore an e at the end of a short e.g. + + cav(e), lov(e), hop(e), crim(e), but + snow, box, tray. 
+ """ + if i < (self.k0 + 2) or not self.cons(i) or self.cons(i - 1) or not self.cons(i - 2): + return 0 + ch = self.b[i] + if ch == 'w' or ch == 'x' or ch == 'y': + return 0 + return 1 + + def ends(self, s): + """ends(s) is TRUE <=> k0,...k ends with the string s.""" + length = len(s) + if s[length - 1] != self.b[self.k]: # tiny speed-up + return 0 + if length > (self.k - self.k0 + 1): + return 0 + if self.b[self.k - length + 1:self.k + 1] != s: + return 0 + self.j = self.k - length + return 1 + + def setto(self, s): + """setto(s) sets (j+1),...k to the characters in the string s, readjusting k.""" + length = len(s) + self.b = self.b[:self.j + 1] + s + self.b[self.j + length + 1:] + self.k = self.j + length + + def r(self, s): + """r(s) is used further down.""" + if self.m() > 0: + self.setto(s) + + def step1ab(self): + """step1ab() gets rid of plurals and -ed or -ing. e.g. + + caresses -> caress + ponies -> poni + ties -> ti + caress -> caress + cats -> cat + + feed -> feed + agreed -> agree + disabled -> disable + + matting -> mat + mating -> mate + meeting -> meet + milling -> mill + messing -> mess + + meetings -> meet + """ + if self.b[self.k] == 's': + if self.ends("sses"): + self.k = self.k - 2 + elif self.ends("ies"): + self.setto("i") + elif self.b[self.k - 1] != 's': + self.k = self.k - 1 + if self.ends("eed"): + if self.m() > 0: + self.k = self.k - 1 + elif (self.ends("ed") or self.ends("ing")) and self.vowelinstem(): + self.k = self.j + if self.ends("at"): + self.setto("ate") + elif self.ends("bl"): + self.setto("ble") + elif self.ends("iz"): + self.setto("ize") + elif self.doublec(self.k): + self.k = self.k - 1 + ch = self.b[self.k] + if ch == 'l' or ch == 's' or ch == 'z': + self.k = self.k + 1 + elif (self.m() == 1 and self.cvc(self.k)): + self.setto("e") + + def step1c(self): + """step1c() turns terminal y to i when there is another vowel in the stem.""" + if (self.ends("y") and self.vowelinstem()): + self.b = self.b[:self.k] + 'i' + self.b[self.k + 1:] + + def step2(self): + """step2() maps double suffices to single ones. + so -ization ( = -ize plus -ation) maps to -ize etc. note that the + string before the suffix must give m() > 0. 
+ """ + if self.b[self.k - 1] == 'a': + if self.ends("ational"): + self.r("ate") + elif self.ends("tional"): + self.r("tion") + elif self.b[self.k - 1] == 'c': + if self.ends("enci"): + self.r("ence") + elif self.ends("anci"): + self.r("ance") + elif self.b[self.k - 1] == 'e': + if self.ends("izer"): + self.r("ize") + elif self.b[self.k - 1] == 'l': + if self.ends("bli"): + self.r("ble") # --DEPARTURE-- + # To match the published algorithm, replace this phrase with + # if self.ends("abli"): self.r("able") + elif self.ends("alli"): + self.r("al") + elif self.ends("entli"): + self.r("ent") + elif self.ends("eli"): + self.r("e") + elif self.ends("ousli"): + self.r("ous") + elif self.b[self.k - 1] == 'o': + if self.ends("ization"): + self.r("ize") + elif self.ends("ation"): + self.r("ate") + elif self.ends("ator"): + self.r("ate") + elif self.b[self.k - 1] == 's': + if self.ends("alism"): + self.r("al") + elif self.ends("iveness"): + self.r("ive") + elif self.ends("fulness"): + self.r("ful") + elif self.ends("ousness"): + self.r("ous") + elif self.b[self.k - 1] == 't': + if self.ends("aliti"): + self.r("al") + elif self.ends("iviti"): + self.r("ive") + elif self.ends("biliti"): + self.r("ble") + elif self.b[self.k - 1] == 'g': # --DEPARTURE-- + if self.ends("logi"): + self.r("log") + # To match the published algorithm, delete this phrase + + def step3(self): + """step3() dels with -ic-, -full, -ness etc. similar strategy to step2.""" + if self.b[self.k] == 'e': + if self.ends("icate"): + self.r("ic") + elif self.ends("ative"): + self.r("") + elif self.ends("alize"): + self.r("al") + elif self.b[self.k] == 'i': + if self.ends("iciti"): + self.r("ic") + elif self.b[self.k] == 'l': + if self.ends("ical"): + self.r("ic") + elif self.ends("ful"): + self.r("") + elif self.b[self.k] == 's': + if self.ends("ness"): + self.r("") + + def step4(self): + """step4() takes off -ant, -ence etc., in context vcvc.""" + if self.b[self.k - 1] == 'a': + if self.ends("al"): + pass + else: + return + elif self.b[self.k - 1] == 'c': + if self.ends("ance"): + pass + elif self.ends("ence"): + pass + else: + return + elif self.b[self.k - 1] == 'e': + if self.ends("er"): + pass + else: + return + elif self.b[self.k - 1] == 'i': + if self.ends("ic"): + pass + else: + return + elif self.b[self.k - 1] == 'l': + if self.ends("able"): + pass + elif self.ends("ible"): + pass + else: + return + elif self.b[self.k - 1] == 'n': + if self.ends("ant"): + pass + elif self.ends("ement"): + pass + elif self.ends("ment"): + pass + elif self.ends("ent"): + pass + else: + return + elif self.b[self.k - 1] == 'o': + if self.ends("ion") and (self.b[self.j] == 's' or self.b[self.j] == 't'): + pass + elif self.ends("ou"): + pass + # takes care of -ous + else: + return + elif self.b[self.k - 1] == 's': + if self.ends("ism"): + pass + else: + return + elif self.b[self.k - 1] == 't': + if self.ends("ate"): + pass + elif self.ends("iti"): + pass + else: + return + elif self.b[self.k - 1] == 'u': + if self.ends("ous"): + pass + else: + return + elif self.b[self.k - 1] == 'v': + if self.ends("ive"): + pass + else: + return + elif self.b[self.k - 1] == 'z': + if self.ends("ize"): + pass + else: + return + else: + return + if self.m() > 1: + self.k = self.j + + def step5(self): + """step5() removes a final -e if m() > 1, and changes -ll to -l if + m() > 1. 
+ """ + self.j = self.k + if self.b[self.k] == 'e': + a = self.m() + if a > 1 or (a == 1 and not self.cvc(self.k - 1)): + self.k = self.k - 1 + if self.b[self.k] == 'l' and self.doublec(self.k) and self.m() > 1: + self.k = self.k - 1 + + def stem(self, p, i, j): + """In stem(p,i,j), p is a char pointer, and the string to be stemmed + is from p[i] to p[j] inclusive. Typically i is zero and j is the + offset to the last character of a string, (p[j+1] == '\0'). The + stemmer adjusts the characters p[i] ... p[j] and returns the new + end-point of the string, k. Stemming never increases word length, so + i <= k <= j. To turn the stemmer into a module, declare 'stem' as + extern, and delete the remainder of this file. + """ + # copy the parameters into statics + self.b = p + self.k = j + self.k0 = i + if self.k <= self.k0 + 1: + return self.b # --DEPARTURE-- + + # With this line, strings of length 1 or 2 don't go through the + # stemming process, although no mention is made of this in the + # published algorithm. Remove the line to match the published + # algorithm. + + self.step1ab() + self.step1c() + self.step2() + self.step3() + self.step4() + self.step5() + return self.b[self.k0:self.k + 1] \ No newline at end of file diff --git a/cmpe493/project1/stopwords.txt b/cmpe493/project1/stopwords.txt new file mode 100644 index 0000000..1b5e1f4 --- /dev/null +++ b/cmpe493/project1/stopwords.txt @@ -0,0 +1,54 @@ +a +all +an +and +any +are +as +be +been +but +by +few +for +have +he +her +here +him +his +how +i +in +is +it +its +many +me +my +none +of +on +or +our +she +some +the +their +them +there +they +that +this +us +was +what +when +where +which +who +why +will +with +you +your diff --git a/cmpe493/project1/tokenizer.py b/cmpe493/project1/tokenizer.py new file mode 100644 index 0000000..5aa41db --- /dev/null +++ b/cmpe493/project1/tokenizer.py @@ -0,0 +1,102 @@ +import re +from string import punctuation, digits +from stemmer import PorterStemmer + +class Tokenizer: + """ + Handles tokenize operations + """ + + # Global stemmer instance of Tokenizer class + stemmer = PorterStemmer() + + @staticmethod + def tokenize(text): + """ + Tokenizes given text after normalizing and stemming + """ + # Normalize given text + text = Tokenizer.normalize(text) + # Split text from whitespaces + words = text.split() + # Stem all tokens and return list + return list(map(Tokenizer.stem, words)) + + @staticmethod + def make_lower_case(text): + """ + Makes given text lower case + """ + return text.lower() + + @staticmethod + def remove_stop_words(text): + """ + Removes stop words from given text + """ + # Get stop word list from file + stop_words = Tokenizer.stop_words() + # Replace stop words with empty string + remove_list_regex = re.compile(r'\b|\b'.join(map(re.escape, stop_words))) + return remove_list_regex.sub('', text) + + @staticmethod + def stop_words(): + """ + Returns stop word list from text file + """ + return open('stopwords.txt').read().split() + + @staticmethod + def remove_extra_whitespaces(text): + """ + Removes extra whitespaces from given text such as multiple adjencent space + """ + return re.sub(r'\s+', ' ', text).strip() + + @staticmethod + def remove_punctuation(text): + """ + Removes punctuations from given text + """ + # Replace punctuation with space instead of remove it for hand-to-mouth, six-week-old, euro-certificate + return text.translate(str.maketrans(punctuation, ' ' * len(punctuation))) + + @staticmethod + def remove_digits(text): + """ + Removes digits from given text + """ + return 
text.translate(str.maketrans('', '', digits)) + + @staticmethod + def normalize(text): + """ + Normalizes given text + Steps: + 1. Make lowercase + 2. Remove punctuation + 3. Remove digit + 4. Remove extra whitespace + """ + text = Tokenizer.make_lower_case(text) + # text = Tokenizer.remove_stop_words(text) + text = Tokenizer.remove_punctuation(text) + text = Tokenizer.remove_digits(text) + text = Tokenizer.remove_extra_whitespaces(text) + return text + + @staticmethod + def normalize_and_stem(text): + """ + Normalizes and stems given text + """ + text = Tokenizer.normalize(text) + return Tokenizer.stem(text) + + @staticmethod + def stem(word): + """ + Stems given word with Porter Stemmer + """ + return Tokenizer.stemmer.stem(word, 0, len(word) - 1) diff --git a/cmpe493/project2/CmpE493_HW2_Report.pdf b/cmpe493/project2/CmpE493_HW2_Report.pdf new file mode 100644 index 0000000..ced9aa5 Binary files /dev/null and b/cmpe493/project2/CmpE493_HW2_Report.pdf differ diff --git a/cmpe493/project2/README.md b/cmpe493/project2/README.md new file mode 100644 index 0000000..66e458a --- /dev/null +++ b/cmpe493/project2/README.md @@ -0,0 +1,9 @@ +Entry point of my application is app.py + +You should use python 3.6 + +Run my app with "python3 app.py" command. + +Stop word data have to be in current directory and named as "stopwords.txt" + +It takes about 2 minutes diff --git a/cmpe493/project2/app.py b/cmpe493/project2/app.py new file mode 100644 index 0000000..b4632e4 --- /dev/null +++ b/cmpe493/project2/app.py @@ -0,0 +1,161 @@ + +from reader import Reader +from topic import Topic + + +# TRANING +# traning_docs, test_docs = Reader.read_files('test') +traning_docs, test_docs = Reader.read_files('reuters21578') + +number_of_docs = len(traning_docs) + +# Find vocabulary length +vocabulary = [] +for doc in traning_docs: + vocabulary += doc.words +vocabulary = set(vocabulary) +vocabulary_length = len(vocabulary) + +topics = ["earn", "acq", "money-fx", "grain", "crude"] + +knowledge = {} + +# Train topic with all lexicons +for topic in topics: + t = Topic(name=topic, documents=[doc for doc in traning_docs if doc.topic == topic], total_n_docs=number_of_docs, vocabulary_length=vocabulary_length) + t.train_all_features() + knowledge[topic] = t + +# Select features +feature_vocabulary = [] +for topic in knowledge.values(): + topic.select_features(knowledge.values(), 50) + feature_vocabulary += topic.features + +# Find feature vocabulary length +feature_vocabulary = set(feature_vocabulary) +feature_vocabulary_length = len(feature_vocabulary) + +for topic in knowledge.values(): + topic.train_mutual(feature_vocabulary) + +# TESTING +print("Testing documents") +for doc in test_docs: + doc.apply_bayes_with_all_features(knowledge.values(), vocabulary) + doc.apply_bayes_with_mutual(knowledge.values(), feature_vocabulary) + +# Calculating performance +measures = {} +measures['contingency'] = { "all": { "tp":0, "fp":0, "fn":0, "tn":0 }, "mutual": { "tp":0, "fp":0, "fn":0, "tn":0 } } +measures['macro_total'] = { "all": { "precision":0, "recall":0, "f":0 }, "mutual": { "precision":0, "recall":0, "f":0 } } + +for topic in topics: + measures[topic] = { "all": { "tp":0, "fp":0, "fn":0, "tn":0 }, "mutual": { "tp":0, "fp":0, "fn":0, "tn":0 } } + for doc in test_docs: + if topic == doc.topic: + # Truth YES + + # All Lexicon + if doc.topic == doc.guess_all: + # Classifier YES + measures[topic]["all"]["tp"] += 1 + measures['contingency']["all"]["tp"] += 1 + else: + # Classifier NO + measures[topic]["all"]["fn"] += 1 + 
measures['contingency']["all"]["fn"] += 1 + + # Mutual Information + if doc.topic == doc.guess_mutual: + # Classifier YES + measures[topic]["mutual"]["tp"] += 1 + measures['contingency']["mutual"]["tp"] += 1 + else: + # Classifier NO + # Classifier NO + measures[topic]["mutual"]["fn"] += 1 + measures['contingency']["mutual"]["fn"] += 1 + else: + # Truth NO + # All Lexicon + if doc.topic == doc.guess_all: + # Classifier YES + measures[topic]["all"]["tn"] += 1 + measures['contingency']["all"]["tn"] += 1 + else: + # Classifier NO + measures[topic]["all"]["fp"] += 1 + measures['contingency']["all"]["fp"] += 1 + + # Mutual Information + if doc.topic == doc.guess_mutual: + # Classifier YES + measures[topic]["mutual"]["tn"] += 1 + measures['contingency']["mutual"]["tn"] += 1 + else: + # Classifier NO + measures[topic]["mutual"]["fp"] += 1 + measures['contingency']["mutual"]["fp"] += 1 + +print("\n\t\t\t\t\tDOCUMENT COUNTS") +print("Train") +total = 0 +for key, topic in knowledge.items(): + print(topic.name + ": " + str(len(topic.documents))) + total += len(topic.documents) +print("Total: " + str(total)) + +print("\n\nTest") +total = {} +for doc in test_docs: + count = total.get(doc.topic, 0) + count += 1 + total[doc.topic] = count +for topic, count in total.items(): + print(topic + ": " + str(count)) +print("Total: " + str(sum(total.values()))) + +print("\n\t\t\t\t\tSELECTED FEATURES") +for key, topic in knowledge.items(): + print(topic.name) + print(topic.features) + print("") + +print("\n\t\t\t\t\tPERFORMANCE VALUES") +# Calculate precision, recall and f-measure +for topic, measure in measures.items(): + if topic != "macro_total": + for classifier, values in measure.items(): + precision = measures[topic][classifier]['tp'] / (measures[topic][classifier]['tp'] + measures[topic][classifier]['fp']) + recall = measures[topic][classifier]['tp'] / (measures[topic][classifier]['tp'] + measures[topic][classifier]['fn']) + f = (2 * precision * recall) / (precision + recall) + + measures[topic][classifier]['precision'] = precision + measures[topic][classifier]['recall'] = recall + measures[topic][classifier]['f'] = f + + if topic != 'contingency': + print("\n" + topic + " - " + classifier) + print("\tPrecision: " + str(precision)) + print("\tRecall: " + str(recall)) + print("\tF-measure: " + str(f)) + measures['macro_total'][classifier]['precision'] += precision + measures['macro_total'][classifier]['recall'] += recall + measures['macro_total'][classifier]['f'] += f + +print("\nTraning with All Lexicon") +print("\tMacro-Averaged Precision: " + str(measures['macro_total']['all']['precision'] / len(topics))) +print("\tMicro-Averaged Precision: " + str(measures['contingency']['all']['precision'])) +print("\n\tMacro-Averaged Recall: " + str(measures['macro_total']['all']['recall'] / len(topics))) +print("\tMicro-Averaged Recall: " + str(measures['contingency']['all']['recall'])) +print("\n\tMacro-Averaged F-measure: " + str(measures['macro_total']['all']['f'] / len(topics))) +print("\tMicro-Averaged F-measure: " + str(measures['contingency']['all']['f'])) + +print("\n\nTraning with Selected Features by Mutual Information") +print("\tMacro-Averaged Precision: " + str(measures['macro_total']['mutual']['precision'] / len(topics))) +print("\tMicro-Averaged Precision: " + str(measures['contingency']['mutual']['precision'])) +print("\n\tMacro-Averaged Recall: " + str(measures['macro_total']['mutual']['recall'] / len(topics))) +print("\tMicro-Averaged Recall: " + str(measures['contingency']['mutual']['recall'])) 
+print("\n\tMacro-Averaged F-measure: " + str(measures['macro_total']['mutual']['f'] / len(topics))) +print("\tMicro-Averaged F-measure: " + str(measures['contingency']['mutual']['f'])) \ No newline at end of file diff --git a/cmpe493/project2/cmpe493-assignment2-specification.pdf b/cmpe493/project2/cmpe493-assignment2-specification.pdf new file mode 100644 index 0000000..306a675 Binary files /dev/null and b/cmpe493/project2/cmpe493-assignment2-specification.pdf differ diff --git a/cmpe493/project2/document.py b/cmpe493/project2/document.py new file mode 100644 index 0000000..8aa698d --- /dev/null +++ b/cmpe493/project2/document.py @@ -0,0 +1,37 @@ +import operator + +class Document: + """ + Handles documents + """ + + def __init__(self, id, words, topic): + self.id = id + self.words = words + self.topic = topic + + def apply_bayes_with_all_features(self, topics, vocabulary): + """ + Guesses topic by naive bayes that trained by all lexicon + """ + scores = {} + for topic in topics: + score = topic.prior + for word in self.words: + if word in vocabulary: + score += topic.get_word_prob_all(word) + scores[topic.name] = score + self.guess_all = max(scores.items(), key=operator.itemgetter(1))[0] + + def apply_bayes_with_mutual(self, topics, vocabulary): + """ + Guesses topic by naive bayes that trained by selected features + """ + scores = {} + for topic in topics: + score = topic.prior + for word in self.words: + if word in vocabulary: + score += topic.get_word_prob_mutual(word) + scores[topic.name] = score + self.guess_mutual = max(scores.items(), key=operator.itemgetter(1))[0] \ No newline at end of file diff --git a/cmpe493/project2/reader.py b/cmpe493/project2/reader.py new file mode 100644 index 0000000..805ea5b --- /dev/null +++ b/cmpe493/project2/reader.py @@ -0,0 +1,69 @@ +import os, re, pickle +from tokenizer import Tokenizer +from document import Document + +class Reader: + """ + Handles reading operations + """ + + topics = ["earn", "acq", "money-fx", "grain", "crude"] + + @classmethod + def read_files(self, directory=None): + """ + Returns read documents from data directory + """ + print("Reading files") + # If no directory is given, set it to current directory + directory = os.getcwd() if directory is None else directory + filenames = os.listdir(directory) + # Get all file with .sgm extension + filenames = [filename for filename in filenames if filename.endswith(".sgm")] + filenames.sort() + traning_docs = [] + test_docs = [] + # Extract documents from each file + print("Extracting documents") + for filename in filenames: + raw_data = open(os.path.join(directory, filename), "r", encoding="latin-1").read() + traning, test = self.extract_documents(raw_data) + traning_docs += traning + test_docs += test + return traning_docs, test_docs + + @classmethod + def extract_documents(self, raw_data): + """ + Extracts documents from raw string + """ + traning_docs = [] + test_docs = [] + # Seperate each document + raw_documents = raw_data.split('') + # Extract information from each raw document string + for raw_document in raw_documents: + doc_id = re.match(r'.+?NEWID=\"(?P\d+)\">.+?', raw_document, re.DOTALL) + doc_title = re.match(r'.+?(?P<title>.+?).+?', raw_document, re.DOTALL) + doc_body = re.match(r'.+?(?P.+?).+?', raw_document, re.DOTALL) + doc_topics = re.match(r'.+?(?P.+?).+?', raw_document, re.DOTALL) + if doc_topics: + doc_topics = re.findall(r'.*?(?P.+?).*?', doc_topics.group('topics'), re.DOTALL) + doc_type = re.findall(r'LEWISSPLIT=\"(?P\w+?)\"', raw_document) + doc_type = 
doc_type[0] if len(doc_type) == 1 else None + + # If raw corpus has ID, it's a document, add it to list + if doc_id and doc_topics and doc_type: + intersect = list(set(self.topics) & set(doc_topics)) + if len(intersect) == 1: + doc_id = int(doc_id.group('id')) + # If it's not have title or body, put empty string instead of them + doc_title = doc_title.group('title') if doc_title else '' + doc_body = doc_body.group('body') if doc_body else '' + doc_class = intersect[0] + doc = Document(id=doc_id, words=Tokenizer.tokenize(doc_title + " " + doc_body), topic=doc_class) + if doc_type == "TRAIN": + traning_docs.append(doc) + elif doc_type == "TEST": + test_docs.append(doc) + return traning_docs, test_docs \ No newline at end of file diff --git a/cmpe493/project2/stopwords.txt b/cmpe493/project2/stopwords.txt new file mode 100644 index 0000000..1b5e1f4 --- /dev/null +++ b/cmpe493/project2/stopwords.txt @@ -0,0 +1,54 @@ +a +all +an +and +any +are +as +be +been +but +by +few +for +have +he +her +here +him +his +how +i +in +is +it +its +many +me +my +none +of +on +or +our +she +some +the +their +them +there +they +that +this +us +was +what +when +where +which +who +why +will +with +you +your diff --git a/cmpe493/project2/tokenizer.py b/cmpe493/project2/tokenizer.py new file mode 100644 index 0000000..821f8e9 --- /dev/null +++ b/cmpe493/project2/tokenizer.py @@ -0,0 +1,80 @@ +import re +from string import punctuation, digits + +class Tokenizer: + """ + Handles tokenize operations + """ + + + @staticmethod + def tokenize(text): + """ + Tokenizes given text after normalizing + """ + # Normalize given text + text = Tokenizer.normalize(text) + # Split text from whitespaces + return text.split() + + @staticmethod + def make_lower_case(text): + """ + Makes given text lower case + """ + return text.lower() + + @staticmethod + def remove_stop_words(text): + """ + Removes stop words from given text + """ + # Get stop word list from file + stop_words = Tokenizer.stop_words() + # Replace stop words with empty string + remove_list_regex = re.compile(r'\b|\b'.join(map(re.escape, stop_words))) + return remove_list_regex.sub('', text) + + @staticmethod + def stop_words(): + """ + Returns stop word list from text file + """ + return open('stopwords.txt').read().split() + + @staticmethod + def remove_extra_whitespaces(text): + """ + Removes extra whitespaces from given text such as multiple adjencent space + """ + return re.sub(r'\s+', ' ', text).strip() + + @staticmethod + def remove_punctuation(text): + """ + Removes punctuations from given text + """ + # Replace punctuation with space instead of remove it for hand-to-mouth, six-week-old, euro-certificate + return text.translate(str.maketrans(punctuation, ' ' * len(punctuation))) + + @staticmethod + def remove_digits(text): + """ + Removes digits from given text + """ + return text.translate(str.maketrans('', '', digits)) + + @staticmethod + def normalize(text): + """ + Normalizes given text + Steps: + 1. Make lowercase + 2. Remove punctuation + 3. Remove digit + 4. 
Remove extra whitespace + """ + text = Tokenizer.make_lower_case(text) + text = Tokenizer.remove_stop_words(text) + text = Tokenizer.remove_punctuation(text) + return text \ No newline at end of file diff --git a/cmpe493/project2/topic.py b/cmpe493/project2/topic.py new file mode 100644 index 0000000..a97f687 --- /dev/null +++ b/cmpe493/project2/topic.py @@ -0,0 +1,95 @@ +from collections import Counter +import math +import operator + +class Topic: + + alpha = 1 + + def __init__(self, name, documents, total_n_docs, vocabulary_length): + self.name = name + self.documents = documents + self.text = [] + for doc in self.documents: + self.text += doc.words + self.total_n_docs = total_n_docs + self.vocabulary_length = vocabulary_length + + def train_all_features(self): + """ + Trains Naive bayes with all lexicon + """ + print("Traning \"" + self.name + "\" class with all features") + # Calculate P(c_j) + self.prior = math.log2(len(self.documents) / self.total_n_docs) + + self.words_prob_all = {} + self.words_doc_count = {} + text_length = len(self.text) + counter = Counter(self.text) + + for word in counter.keys(): + occurence = counter.get(word, 0) + # Calculate P(w | c_j) for each word + self.words_prob_all[word] = math.log2((occurence + self.alpha) / (text_length + self.alpha * self.vocabulary_length)) + + # Calculate document occurence count for each word + self.words_doc_count[word] = len([doc for doc in self.documents if word in doc.words]) + + def get_word_prob_all(self, word): + """ + Get conditional probability for word. If dictionary doesn't it, return smoothed value + """ + return self.words_prob_all.get(word, math.log2(self.alpha / (len(self.text) + self.alpha * self.vocabulary_length))) + + def train_mutual(self, feature_vocabulary): + """ + Trains Naive bayes with selected features by mutual information + """ + print("Traning \"" + self.name + "\" class with mutual information") + self.feature_vocabulary_length = len(feature_vocabulary) + self.words_prob_mutual = {} + self.feature_text = [word for word in self.text if word in feature_vocabulary] + text_length = len(self.feature_text) + counter = Counter(self.feature_text) + for word in counter.keys(): + occurence = counter.get(word, 0) + # Calculate P(w | c_j) for each word based on selected words + self.words_prob_mutual[word] = math.log2((occurence + self.alpha) / (text_length + self.alpha * self.feature_vocabulary_length)) + + def get_word_prob_mutual(self, word): + """ + Get conditional probability that calculated with selected features for word. 
If the dictionary does not contain it, return the smoothed value
+        """
+        return self.words_prob_mutual.get(word, math.log2(self.alpha / (len(self.feature_text) + self.alpha * self.feature_vocabulary_length)))
+
+    def get_words_doc_count(self, word, contain = True):
+        """
+        Get the number of documents containing the word; if contain is False, return the number of documents that do not contain it
+        """
+        if contain:
+            return self.words_doc_count.get(word, 0)
+        else:
+            return len(self.documents) - self.words_doc_count.get(word, 0)
+
+    def select_features(self, topics, count):
+        """
+        Selects features via mutual information
+        """
+        print("Selecting features for the " + self.name + " class")
+        # Calculate the mutual information utility of each word
+        words_utility = {}
+        for word in set(self.text):
+            n11 = self.get_words_doc_count(word)
+            n01 = self.get_words_doc_count(word, False)
+            n10 = 0
+            n00 = 0
+            for topic in topics:
+                if self.name != topic.name:
+                    n10 += topic.get_words_doc_count(word)
+                    n00 += topic.get_words_doc_count(word, False)
+            n = n11 + n01 + n10 + n00
+            words_utility[word] = ((n11 / n) * math.log2((n * n11 + 1) / ((n11 + n10) * (n11 + n01)))) + ((n01 / n) * math.log2((n * n01 + 1) / ((n01 + n00) * (n01 + n11)))) + ((n10 / n) * math.log2((n * n10 + 1) / ((n10 + n11) * (n10 + n00)))) + ((n00 / n) * math.log2((n * n00 + 1) / ((n00 + n01) * (n00 + n10))))
+        # Keep the count highest-scoring words as features (50 in this assignment)
+        self.features = [x[0] for x in sorted(words_utility.items(), key=operator.itemgetter(1), reverse=True)[:count]]
+
diff --git a/cmpe493/project3/CmpE493_HW3_Report.pdf b/cmpe493/project3/CmpE493_HW3_Report.pdf
new file mode 100644
index 0000000..bc1b172
Binary files /dev/null and b/cmpe493/project3/CmpE493_HW3_Report.pdf differ
diff --git a/cmpe493/project3/README.md b/cmpe493/project3/README.md
new file mode 100644
index 0000000..509275b
--- /dev/null
+++ b/cmpe493/project3/README.md
@@ -0,0 +1,29 @@
+The entry point of my application is app.py
+
+You should use Python 3.6.5
+
+Run the app with the "python3 app.py COMMAND DATA_DIRECTORY [FILE]" command.
+
+The file name is optional.
+If no file name is given, the command runs for all files in the data directory.
+
+## Available Commands ##
+    lex     - Print LexRank scores of the sentences
+    summary - Print the generated summary of the news article
+    gold    - Print the gold summary of the news article
+    rouge   - Print the average ROUGE scores
+
+## Example ##
+
+python3 app.py lex Dataset 1.txt
+    Prints LexRank scores of the sentences in 1.txt
+
+-----
+
+python3 app.py lex Dataset
+    Prints LexRank scores of the sentences in all files, line by line
+
+-----
+
+python3 app.py rouge Dataset
+    Prints the average ROUGE scores over all files
\ No newline at end of file
diff --git a/cmpe493/project3/app.py b/cmpe493/project3/app.py
new file mode 100644
index 0000000..f03dfe9
--- /dev/null
+++ b/cmpe493/project3/app.py
@@ -0,0 +1,217 @@
+import os, sys
+import math
+import numpy as np
+from rouge import Rouge
+
+# CONSTANTS
+COSINE_SIMILARITY_THRESHOLD = 0.10
+TELEPORTATION_RATE = 0.15
+ERROR_TOLERANCE = 0.00001
+
+def read_news(directory):
+    """
+    Reads news from files. 
Returns news and summaries + """ + + filenames = os.listdir(directory) + # Get all file with .txt extension + filenames = [filename for filename in filenames if filename.endswith(".txt")] + filenames.sort() + news = {} + summaries = {} + # Extract news from each file + for filename in filenames: + raw_data = open(os.path.join(directory, filename), "r", encoding="utf-8").read() + raw_news = raw_data.split('\n\n') + news[filename] = raw_news[0].strip().split('\n') + summaries[filename] = raw_news[1].strip() + return news, summaries + +def calculate_idf(news): + """ + Calculates idf for given news + """ + + df = {} + idf = {} + N = len(news) + # Count document frequency for each term + for text in news.values(): + tokens = " ".join(text).split() + terms = set(tokens) + for term in terms: + df[term] = df.get(term, 0) + 1 + # Calculate idf for each term + for term, freq in df.items(): + idf[term] = math.log10(N / freq) + return idf + +def calculate_tf_idf(sentence, idf, terms): + """ + Calculates tf idf vector for given sentence + """ + + counts = {} + # Count term frequencies + for token in sentence.split(): + counts[token] = counts.get(token, 0) + 1 + + tf_idf = [] + for term in terms: + # Calculate tf + tf = 1 + math.log10(counts.get(term, 0.1)) + # Add tf idf to result + tf_idf.append(tf * idf.get(term)) + return tf_idf + +def unit_vector(vec): + """ + Returns unit vector that points same direction with given vector + """ + + return vec / np.linalg.norm(vec) + +def cosine_similarity(vec1, vec2): + """ + Calculated cosine similarity between given two vectors + """ + + return np.dot(unit_vector(vec1), unit_vector(vec2)) + +def print_matrix(m): + """ + Prints matrix prettier + """ + for row in m: + text = "" + for column in row: + text += "{:^10.5f}".format(column) + print(text) + +def power_iteration(m): + """ + Returns eagen vector of given matrix + """ + x = [1/len(m)] * len(m) + while True: + x_new = np.matmul(x, m) + for i in range(len(m)): + if abs(x_new[i] - x[i]) >= ERROR_TOLERANCE: + break + return x_new + x = x_new + +def calculate_lex_rank(sentences, idf): + """ + Calculates lex rank of given sentences + """ + + lex_ranks = [] + tf_idf = [] + terms = list(set(" ".join(sentences).split())) + dim = len(sentences) + # Calculate tf idfs of sentences + for sentence in sentences: + tf_idf.append(calculate_tf_idf(sentence, idf, terms)) + + # Build adjencency matrix with 0 and 1 + adj_mat = [] + for x in range(dim): + adj_mat.append([]) + for y in range(dim): + cos_sim = cosine_similarity(tf_idf[x], tf_idf[y]) + adj_mat[x].append(1 if (cos_sim >= COSINE_SIMILARITY_THRESHOLD) else 0) + + # Convert adjencency matrix to probability matrix with teleportation rate + for x in range(dim): + N = sum(adj_mat[x]) + for y in range(dim): + adj_mat[x][y] = ((adj_mat[x][y] / N) * (1 - TELEPORTATION_RATE)) + (TELEPORTATION_RATE / dim) + + return list(power_iteration(adj_mat)) + +def summarize(sentences, idf): + """ + Generates summary for given sentences + """ + + maxest = [] + lex_rank = calculate_lex_rank(sentences, idf) + + # Find three sentences indexes that have highest lex rank + for i in range(3): + max_lex = max(lex_rank) + max_index = lex_rank.index(max_lex) + maxest.append(max_index) + lex_rank[max_index] = 0 + + # Sort indexes. 
Don't change topic flow + maxest.sort() + summary = [] + for index in maxest: + summary.append(sentences[index]) + + return "\n".join(summary) + +############################### +####### APP ENTRY POINT ####### +############################### + +# Check arguments +if len(sys.argv) < 2: + print("You have to give command name") + print("python3 app.py [COMMAND] [DATA_DIRECTORY] [FILE_NAME]") + exit(1) +elif len(sys.argv) < 3: + print("You have to give directory name") + print("python3 app.py [COMMAND] [DATA_DIRECTORY] [FILE_NAME]") + exit(1) + +command = sys.argv[1] +directory = sys.argv[2] + +# Read data set +news, summaries = read_news(directory) + +# if no file is given process all of them +files = [sys.argv[3]] if len(sys.argv) == 4 else news.keys() + +# Calculate IDFs +idf = calculate_idf(news) + +# Run command +if command == "lex": + for file in files: + lex_rank = calculate_lex_rank(news[file], idf) + print(" ".join(["{:.3f}".format(rank) for rank in lex_rank])) +elif command == "summary": + for file in files: + print(summarize(news[file], idf)) +elif command == "gold": + for file in files: + print(summaries[file]) +elif command == "rouge": + rouge = Rouge() + total = {} + total["rouge-1"] = {"f":0, "r":0, "p":0} + total["rouge-2"] = {"f":0, "r":0, "p":0} + total["rouge-l"] = {"f":0, "r":0, "p":0} + for file in files: + generated_summary = summarize(news[file], idf) + gold_summary = summaries[file] + scores = rouge.get_scores(gold_summary, generated_summary) + for type in ["rouge-1", "rouge-2", "rouge-l"]: + for stat in ["p", "r", "f"]: + total[type][stat] += scores[0][type][stat] + + # Divide sum to lenghts + for type in ["rouge-1", "rouge-2", "rouge-l"]: + for stat in ["p", "r", "f"]: + total[type][stat] /= len(files) + + print("Average Rouge Scores") + for type in ["rouge-1", "rouge-2", "rouge-l"]: + print(type) + for stat in ["p", "r", "f"]: + print("\t" + stat + ": " + str(total[type][stat])) + print("\n") \ No newline at end of file diff --git a/cmpe493/project3/cmpe493-assignment3-specification.pdf b/cmpe493/project3/cmpe493-assignment3-specification.pdf new file mode 100644 index 0000000..e1679c7 Binary files /dev/null and b/cmpe493/project3/cmpe493-assignment3-specification.pdf differ
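Note: the LexRank computation added in cmpe493/project3/app.py (cosine-similarity adjacency matrix, teleportation rate, power iteration) is easier to follow outside the diff, so here is a minimal, self-contained sketch of the same idea. It is illustrative only: the constants mirror the ones in app.py, but the `lexrank` helper and the toy sentence vectors are invented for this example and are not part of the project code.

```python
# Minimal LexRank sketch (illustration only, not project code).
# Assumes sentences are already turned into tf-idf vectors; the vectors below are toy data.
import numpy as np

TELEPORTATION_RATE = 0.15
SIMILARITY_THRESHOLD = 0.10
ERROR_TOLERANCE = 1e-5

def lexrank(vectors):
    # Cosine similarity between every pair of sentence vectors
    unit = vectors / np.linalg.norm(vectors, axis=1, keepdims=True)
    sim = unit @ unit.T
    # Binary adjacency matrix: an edge wherever similarity clears the threshold
    adj = (sim >= SIMILARITY_THRESHOLD).astype(float)
    # Row-normalize and mix in the teleportation term so every row stays a probability distribution
    n = len(vectors)
    prob = adj / adj.sum(axis=1, keepdims=True)
    prob = (1 - TELEPORTATION_RATE) * prob + TELEPORTATION_RATE / n
    # Power iteration: repeatedly multiply until the score vector stops changing
    x = np.full(n, 1 / n)
    while True:
        x_new = x @ prob
        if np.max(np.abs(x_new - x)) < ERROR_TOLERANCE:
            return x_new
        x = x_new

toy_vectors = np.array([[1.0, 0.2, 0.0],
                        [0.9, 0.3, 0.1],
                        [0.0, 0.1, 1.0]])
print(lexrank(toy_vectors))  # higher score = more central sentence
```

The summary step in app.py then simply takes the three highest-scoring sentences and prints them in their original document order.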