Add CmpE493 projects

enescakir · May 28, 2018 · 8163e20 · 8163e20
1 parent 8594829
commit 8163e20
Show file tree

Hide file tree

Showing 25 changed files with 1,888 additions and 0 deletions.
diff --git a/README.md b/README.md
@@ -52,5 +52,10 @@ Programming projects that I have written at university
 - [Project 1](/cmpe362/project1) `Matlab`
 - [Project 2](/cmpe362/project2) `Matlab`
 - [Project 3](/cmpe362/project3) `Matlab`
+
+### [CmpE493 - Introduction to Information Retrieval](/cmpe493)
+- [Project 1](/cmpe493/project1) `Python`
+- [Project 2](/cmpe493/project2) `Python`
+- [Project 3](/cmpe493/project3) `Python`
 
 ### [CmpE352 & CmpE451 - Project Development in Software Engineering](https://github.com/bounswe/bounswe2017group11)
diff --git a/cmpe493/README.md b/cmpe493/README.md
@@ -0,0 +1,4 @@
+# CmpE493 - Introduction to Information Retrieval
+| Term | Instructor |
+| --- | --- |
+| Spring 2018  | Arzucan Özgür  |
diff --git a/cmpe493/project1/README.md b/cmpe493/project1/README.md
@@ -0,0 +1,13 @@
+The entry point of my application is app.py.
+
+You should use python3.
+
+Run my app with the "python3 app.py" command.
+
+If you run it for the first time, it creates the inverted index.
+
+The default directory name for the data is "reuters21578".
+
+The stop word file has to be in the current directory and be named "stopwords.txt".
+
+You can use the help command for further information.
diff --git a/cmpe493/project1/Report.pdf b/cmpe493/project1/Report.pdf
diff --git a/cmpe493/project1/app.py b/cmpe493/project1/app.py
@@ -0,0 +1,115 @@
+import os
+import re
+from indexer import Indexer
+from query import Query
+from query import QueryType
+
+def make_bold(string):
+    """
+        Wraps the given string in the terminal escape codes for bold text
+    """
+    bold_on = "\033[1m"
+    reset = "\033[0m"
+    return bold_on + str(string) + reset
+
+def command_help():
+    """
+        Writes the help page (commands and query types) to the terminal
+    """
+    # (usage, padding + description) for every special command
+    commands = [
+        ("exit", "\t\t Exits from program"),
+        ("help", "\t\t Lists available commands"),
+        ("index [FOLDER]", "\t Indexes document positional inverted index. Default: reuters21578"),
+        ("clear", "\t\t Clear console screen"),
+        ("remove", "\t\t Removes inverted index files"),
+        ("postings WORD", "\t Returns postings of word"),
+    ]
+    # (query type prefix, label) for every supported query type
+    query_types = [
+        (QueryType.CONJUNCTIVE, " -> Conjunctive Query"),
+        (QueryType.PHRASE, " -> Phrase Query"),
+        (QueryType.PROXIMITY, " -> Proximity Query"),
+    ]
+
+    lines = ["\n\n\t\t\t\t\t" + make_bold("--- REUTERS SEARCH ENGINE ---")]
+    lines.append(make_bold("COMMANDS"))
+    for usage, description in commands:
+        lines.append("\t" + make_bold(usage) + description)
+    lines.append("\n\t" + "** There is no special command for query processing.")
+    lines.append("\t" + "Inputs that aren't special command interpreted as query")
+    lines.append("\n\t" + make_bold("[QUERY_TYPE] YOUR_QUERY") + "\t Processes query based on given type.")
+    lines.append("\t\t\t\t If no type is given, it predicts query type")
+    lines.append("\t" + make_bold("Query Types"))
+    for query_type, label in query_types:
+        lines.append("\t\t" + make_bold(query_type) + label)
+    lines.append("\n\n\n")
+
+    # One print call per line, exactly like the page was written out before
+    for line in lines:
+        print(line)
+
+def command_index(directory):
+    """
+        Rebuilds the index from the given data directory and reloads it
+        into the module level dictionary and index
+
+        directory: folder that holds the data files, None means 'reuters21578'
+
+        The directory is checked before the old index files are removed, so a
+        mistyped folder name prints a message and leaves the current index alone
+    """
+    global dictionary
+    global index
+    # Set default data directory
+    if directory is None:
+        directory = 'reuters21578'
+    # Validate first: removing the index and then failing would lose it
+    if not os.path.isdir(directory):
+        print('Directory not found: ' + directory)
+        return
+    Indexer.remove_index()
+    print('Indexing ' + directory + ' folder...')
+    Indexer.create_index(directory=directory)
+    dictionary, index = Indexer.get_index()
+    print('Index created')
+
+def command_remove():
+    """
+        Deletes the index files and empties the in-memory dictionary and index
+    """
+    global dictionary, index
+    Indexer.remove_index()
+    dictionary, index = {}, {}
+    print('Index removed')
+
+def command_postings(word, dictionary, index):
+    """
+        Prints the postings that Indexer.get_postings finds for the given word
+    """
+    print(Indexer.get_postings(word, dictionary, index))
+
+def command_exit():
+    """
+        Exits from application
+
+        Raises SystemExit with status 0, because leaving on the user's
+        request is a normal termination and not an error
+    """
+    # Imported here so the function does not depend on the site-only exit() helper
+    import sys
+    print("Goodbye...")
+    sys.exit(0)
+
+def command_clear():
+    """
+        Clears terminal screen
+
+        Runs "cls" on Windows (os.name == "nt") and "clear" everywhere else
+    """
+    os.system("cls" if os.name == "nt" else "clear")
+
+####################################
+########## APP START HERE ##########
+####################################
+
+# Start with an empty index so that queries still work if nothing gets loaded
+dictionary, index = {}, {}
+
+# If the index isn't created create it, otherwise load it from the index files.
+# command_index stores the fresh index in dictionary and index by itself.
+if not Indexer.is_indexed():
+    command_index(None)
+else:
+    print('Data is already indexed')
+    dictionary, index = Indexer.get_index()
+
+print("Type " + make_bold("help") + " for any documentation")
+while True:
+    # Get command from user and processes it
+    try:
+        command = input("query> ")
+    except EOFError:
+        # End of input (Ctrl-D or a finished pipe): leave without a traceback
+        print()
+        command_exit()
+        break
+    # A blank line is neither a command nor a query
+    if not command.strip():
+        continue
+    postings_command = re.match(r'^postings\s+(\w+)', command)
+    # The whole input has to be "index" or "index FOLDER". Otherwise a query
+    # that merely starts with "index" would delete and rebuild the index.
+    # \S+ also accepts folder names that contain '-', '.' or '/'.
+    index_command = re.match(r'^index(?:\s+(\S+))?\s*$', command)
+    if command == "exit":
+        command_exit()
+    elif index_command:
+        command_index(index_command.group(1))
+    elif command == "help":
+        command_help()
+    elif command == "clear":
+        command_clear()
+    elif command == "remove":
+        command_remove()
+    elif postings_command:
+        command_postings(postings_command.group(1), dictionary, index)
+    else:
+        # Everything that is not a special command is processed as a query
+        query = Query(command)
+        result = query.run(dictionary, index)
+        print(make_bold(str(len(result)) + ' documents are founded'))
+        print(sorted(result))
diff --git a/cmpe493/project1/cmpe493-assignment1-specification.pdf b/cmpe493/project1/cmpe493-assignment1-specification.pdf
diff --git a/cmpe493/project1/indexer.py b/cmpe493/project1/indexer.py
@@ -0,0 +1,142 @@
+import os, re, pickle
+from tokenizer import Tokenizer
+
+class Indexer:
+    """
+        Handles inverted index operations
+
+        The dictionary maps each token to the ID of its postings list and the
+        inverted index maps that ID to {document ID: [positions]}. Both are
+        pickled into the files named by DICTIONARY_NAME and INDEX_NAME, which
+        are resolved against the current working directory.
+    """
+    DICTIONARY_NAME = 'dictionary.txt' # Name of the dictionary file
+    INDEX_NAME = 'inverted_index.txt'  # Name of the inverted index file
+    POSTING_ID = 1 # Next ID that get_posting_id hands out (starts at 1)
+
+    @classmethod
+    def read_files(cls, directory=None):
+        """
+            Returns the documents of every .sgm file in the data directory
+
+            directory: folder to read, None means the current directory
+            Returns: list of {'id', 'title', 'body'} dictionaries, in file name order
+        """
+        # If no directory is given, set it to current directory
+        directory = os.getcwd() if directory is None else directory
+        # Get all file with .sgm extension, sorted for a stable document order
+        filenames = sorted(name for name in os.listdir(directory) if name.endswith(".sgm"))
+        documents = []
+        # Extract documents from each file
+        for filename in filenames:
+            # The with block closes every file as soon as it has been read
+            with open(os.path.join(directory, filename), "r", encoding="latin-1") as sgm_file:
+                raw_data = sgm_file.read()
+            documents += cls.extract_documents(raw_data)
+        return documents
+
+    @classmethod
+    def extract_documents(cls, raw_data):
+        """
+            Extracts documents from raw string
+
+            raw_data: content of one data file
+            Returns: list of {'id': int, 'title': str, 'body': str} dictionaries
+        """
+        documents = []
+        # Seperate each document, every document ends with a closing REUTERS tag
+        raw_documents = raw_data.split('</REUTERS>')
+        # Extract information from each raw document string
+        for raw_document in raw_documents:
+            # Some news don't have body or title, so every part is searched
+            # on its own instead of with one pattern for the whole document
+            doc_id = re.search(r'NEWID=\"(?P<id>\d+)\">', raw_document)
+            # If raw corpus has ID, it's a document, add it to list
+            if not doc_id:
+                continue
+            doc_title = re.search(r'<TITLE>(?P<title>.+?)</TITLE>', raw_document, re.DOTALL)
+            doc_body = re.search(r'<BODY>(?P<body>.+?)</BODY>', raw_document, re.DOTALL)
+            documents.append({
+                'id': int(doc_id.group('id')),
+                # If it's not have title or body, put empty string instead of them
+                'title': doc_title.group('title') if doc_title else '',
+                'body': doc_body.group('body') if doc_body else '',
+            })
+        return documents
+
+    @classmethod
+    def create_index(cls, directory=None):
+        """
+            Creates index from data that in given directory and saves it to file
+
+            directory: folder to read, None means the current directory
+        """
+        # Read files and get documents
+        documents = cls.read_files(directory)
+        # Initialize directory and inverted index
+        dictionary = {}
+        inverted_index = {}
+        # Load stop words from file
+        stop_words = Tokenizer.stop_words()
+
+        for document in documents:
+            doc_id = document['id']
+            # Concatenate title and body, then tokenize this combination
+            tokens = Tokenizer.tokenize(document['title'] + ' ' + document['body'])
+            # Stop words are skipped but still counted in the positions
+            for position, token in enumerate(tokens):
+                if token in stop_words:
+                    continue
+                # Take a new ID only for a token that is seen for the first time.
+                # dictionary.get(token, cls.get_posting_id()) would evaluate the
+                # default on every call and use up one ID per token occurrence.
+                postings_id = dictionary.get(token)
+                if postings_id is None:
+                    postings_id = cls.get_posting_id()
+                    dictionary[token] = postings_id
+                # Add this position to the positions of the token in this document
+                postings = inverted_index.setdefault(postings_id, {})
+                postings.setdefault(doc_id, []).append(position)
+        # Save created index to file
+        cls.save_index(dictionary, inverted_index)
+
+    @classmethod
+    def get_posting_id(cls):
+        """
+            Returns globally incremented ID for next postings list
+        """
+        posting_id = cls.POSTING_ID
+        cls.POSTING_ID += 1
+        return posting_id
+
+    @classmethod
+    def get_postings(cls, token, dictionary, index):
+        """
+            Returns documents and positions of given token after normalization
+
+            The token goes through Tokenizer.normalize_and_stem before the lookup
+            Returns: {document ID: [positions]}, empty dictionary for unknown tokens
+        """
+        stem = Tokenizer.normalize_and_stem(token)
+        posting_id = dictionary.get(stem)
+        return index.get(posting_id, {})
+
+    @classmethod
+    def save_index(cls, directory, index):
+        """
+            Save dictionary and inverted index to file
+
+            directory: the token dictionary (the parameter name is kept for callers)
+            index: the inverted index
+        """
+        with open(cls.DICTIONARY_NAME, 'wb') as dictionary_file:
+            pickle.dump(directory, dictionary_file)
+        with open(cls.INDEX_NAME, 'wb') as index_file:
+            pickle.dump(index, index_file)
+
+    @classmethod
+    def get_index(cls):
+        """
+            Load dictionary and inverted index from file
+            Returns: 
+                dictionary, index 
+        """
+        # NOTE: unpickling can run arbitrary code, only load index files
+        # that were written by save_index
+        with open(cls.DICTIONARY_NAME, 'rb') as dictionary_file:
+            dictionary = pickle.load(dictionary_file)
+        with open(cls.INDEX_NAME, 'rb') as index_file:
+            index = pickle.load(index_file)
+        return dictionary, index
+
+    @classmethod
+    def remove_index(cls):
+        """
+            Removes old inverted index files
+        """
+        # Each file is removed on its own, so a missing dictionary file
+        # does not leave the inverted index file behind
+        for name in (cls.DICTIONARY_NAME, cls.INDEX_NAME):
+            try:
+                os.remove(name)
+            except OSError:
+                # Best effort: a file that can't be removed is skipped
+                pass
+
+    @classmethod
+    def is_indexed(cls):
+        """
+            Checks if index is exist, both index files have to be present
+        """
+        return os.path.isfile(cls.DICTIONARY_NAME) and os.path.isfile(cls.INDEX_NAME)