Skip to content

Commit

Permalink
Add CmpE493 projects
Browse files Browse the repository at this point in the history
  • Loading branch information
enescakir committed May 28, 2018
1 parent 8594829 commit 8163e20
Show file tree
Hide file tree
Showing 25 changed files with 1,888 additions and 0 deletions.
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,10 @@ Programming projects that I have written at university
- [Project 1](/cmpe362/project1) `Matlab`
- [Project 2](/cmpe362/project2) `Matlab`
- [Project 3](/cmpe362/project3) `Matlab`
-
### [CmpE493 - Introduction to Information Retrieval](/cmpe493)
- [Project 1](/cmpe493/project1) `Python`
- [Project 2](/cmpe493/project2) `Python`
- [Project 3](/cmpe493/project3) `Python`

### [CmpE352 & CmpE451 - Project Development in Software Engineering](https://github.com/bounswe/bounswe2017group11)
4 changes: 4 additions & 0 deletions cmpe493/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# CmpE493 - Introduction to Information Retrieval
| Term | Instructor |
| --- | --- |
| Spring 2018 | Arzucan Özgür |
13 changes: 13 additions & 0 deletions cmpe493/project1/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
The entry point of the application is app.py.

You should use python3

Run my app with "python3 app.py" command.

The first time you run it, it creates the inverted index.

Default directory name is "reuters21578" for data.

The stop word list must be in the current directory and be named "stopwords.txt".

You can use help command for further information
Binary file added cmpe493/project1/Report.pdf
Binary file not shown.
115 changes: 115 additions & 0 deletions cmpe493/project1/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
import os
import re
from indexer import Indexer
from query import Query
from query import QueryType

def make_bold(string):
    """
    Wraps the given value in the ANSI escape codes that switch bold on and off
    """
    bold_on, bold_off = "\033[1m", "\033[0m"
    return bold_on + format(string) + bold_off

def command_help():
    """
    Writes the help page (commands and query types) to the terminal
    """
    # (command, description) rows; each is printed as TAB + bold command + description
    command_rows = [
        ("exit", "\t\t Exits from program"),
        ("help", "\t\t Lists available commands"),
        ("index [FOLDER]", "\t Indexes document positional inverted index. Default: reuters21578"),
        ("clear", "\t\t Clear console screen"),
        ("remove", "\t\t Removes inverted index files"),
        ("postings WORD", "\t Returns postings of word"),
    ]
    # (query type prefix, description) rows
    query_type_rows = [
        (QueryType.CONJUNCTIVE, " -> Conjunctive Query"),
        (QueryType.PHRASE, " -> Phrase Query"),
        (QueryType.PROXIMITY, " -> Proximity Query"),
    ]

    page = ["\n\n\t\t\t\t\t" + make_bold("--- REUTERS SEARCH ENGINE ---"), make_bold("COMMANDS")]
    page.extend("\t" + make_bold(name) + description for name, description in command_rows)
    page.append("\n\t** There is no special command for query processing.")
    page.append("\tInputs that aren't special command interpreted as query")
    page.append("\n\t" + make_bold("[QUERY_TYPE] YOUR_QUERY") + "\t Processes query based on given type.")
    page.append("\t\t\t\t If no type is given, it predicts query type")
    page.append("\t" + make_bold("Query Types"))
    page.extend("\t\t" + make_bold(kind) + description for kind, description in query_type_rows)
    page.append("\n\n\n")

    for line in page:
        print(line)

def command_index(directory):
    """
    Drops the current index, rebuilds it from the given directory and
    reloads the module-level dictionary and index
    """
    global dictionary, index
    Indexer.remove_index()
    # Fall back to the default data directory when none was given
    target = 'reuters21578' if directory is None else directory
    print('Indexing ' + target + ' folder...')
    Indexer.create_index(directory=target)
    dictionary, index = Indexer.get_index()
    print('Index created')

def command_remove():
    """
    Deletes the index files and empties the in-memory dictionary and index
    """
    global dictionary, index
    Indexer.remove_index()
    dictionary, index = {}, {}
    print('Index removed')

def command_postings(word, dictionary, index):
    """
    Prints the postings that the indexer returns for the given word
    """
    print(Indexer.get_postings(word, dictionary, index))

def command_exit():
    """
    Exits from application

    Raises:
        SystemExit: always, with status 0, because quitting on request is
            a normal termination and not an error.
    """
    print("Goodbye...")
    # Raise SystemExit directly: the `exit` helper is injected by the `site`
    # module and is not guaranteed to exist (e.g. under `python -S`).
    raise SystemExit(0)

def command_clear():
    """
    Clears terminal screen

    Uses `cls` on Windows, where the `clear` command does not exist,
    and `clear` everywhere else.
    """
    os.system("cls" if os.name == "nt" else "clear")

####################################
########## APP START HERE ##########
####################################

# Build the index on first run; otherwise reuse the files already on disk
if not Indexer.is_indexed():
    command_index(None)
else:
    print('Data is already indexed')

dictionary, index = Indexer.get_index()

print("Type " + make_bold("help") + " for any documentation")
while True:
    # Read one command from the user and dispatch it
    try:
        command = input("query> ")
    except EOFError:
        # Closed stdin (Ctrl-D / end of piped input): leave cleanly instead of
        # crashing with a traceback
        print()
        command_exit()
        break
    postings_command = re.match(r'^postings\s(\w+)', command)
    # The whole input must be "index" or "index FOLDER". A prefix match would
    # hijack ordinary queries such as "indexing costs" and wipe the index.
    index_command = re.fullmatch(r'index(?:\s+(\S+))?\s*', command)
    if command == "exit":
        command_exit()
    elif index_command:
        command_index(index_command.group(1))
    elif command == "help":
        command_help()
    elif command == "clear":
        command_clear()
    elif command == "remove":
        command_remove()
    elif postings_command:
        command_postings(postings_command.group(1), dictionary, index)
    else:
        # Anything that is not a special command is treated as a query
        query = Query(command)
        result = query.run(dictionary, index)
        print(make_bold(str(len(result)) + ' documents are founded'))
        print(sorted(result))
Binary file not shown.
142 changes: 142 additions & 0 deletions cmpe493/project1/indexer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
import os, re, pickle
from tokenizer import Tokenizer

class Indexer:
    """
    Handles inverted index operations

    The index consists of two pickled objects:
      * dictionary: token -> postings ID
      * inverted index: postings ID -> {document ID: [token positions]}
    """
    DICTIONARY_NAME = 'dictionary.txt'  # Name of the dictionary file
    INDEX_NAME = 'inverted_index.txt'  # Name of the inverted index file
    POSTING_ID = 1  # Next ID to hand out for a new postings list

    # Field patterns applied with search() to one raw document string
    _ID_PATTERN = re.compile(r'NEWID="(?P<id>\d+)">')
    _TITLE_PATTERN = re.compile(r'<TITLE>(?P<title>.*?)</TITLE>', re.DOTALL)
    _BODY_PATTERN = re.compile(r'<BODY>(?P<body>.*?)</BODY>', re.DOTALL)

    @classmethod
    def read_files(cls, directory=None):
        """
        Returns the documents read from every .sgm file of the data directory

        Args:
            directory: folder to scan; the current working directory if None
        Returns:
            list of {'id', 'title', 'body'} dicts, files processed in name order
        """
        # If no directory is given, set it to current directory
        directory = os.getcwd() if directory is None else directory
        filenames = sorted(name for name in os.listdir(directory) if name.endswith(".sgm"))
        documents = []
        for filename in filenames:
            # Close each file as soon as it is read instead of leaking the handle
            with open(os.path.join(directory, filename), "r", encoding="latin-1") as data_file:
                raw_data = data_file.read()
            documents.extend(cls.extract_documents(raw_data))
        return documents

    @classmethod
    def extract_documents(cls, raw_data):
        """
        Extracts documents from raw string

        The string is split on '</REUTERS>'; every piece that carries a NEWID
        becomes one document. A missing title or body becomes an empty string.

        Returns:
            list of {'id': int, 'title': str, 'body': str} dicts
        """
        documents = []
        for raw_document in raw_data.split('</REUTERS>'):
            # search() finds the field anywhere in the piece, including when
            # its closing tag is the very last text of the document
            id_match = cls._ID_PATTERN.search(raw_document)
            # Pieces without an ID (e.g. the file header or trailing text) are not documents
            if id_match is None:
                continue
            title_match = cls._TITLE_PATTERN.search(raw_document)
            body_match = cls._BODY_PATTERN.search(raw_document)
            documents.append({
                'id': int(id_match.group('id')),
                'title': title_match.group('title') if title_match else '',
                'body': body_match.group('body') if body_match else '',
            })
        return documents

    @classmethod
    def create_index(cls, directory=None):
        """
        Creates the positional inverted index from the data in the given
        directory and saves it to the index files
        """
        documents = cls.read_files(directory)
        dictionary = {}
        inverted_index = {}
        # Load stop words from file
        stop_words = Tokenizer.stop_words()

        for document in documents:
            doc_id = document['id']
            # Title and body are indexed together as one token stream
            tokens = Tokenizer.tokenize(document['title'] + ' ' + document['body'])
            # Positions count every token, so skipped stop words still occupy a position
            for position, token in enumerate(tokens):
                if token in stop_words:
                    continue
                postings_id = dictionary.get(token)
                if postings_id is None:
                    # Allocate an ID only for tokens seen for the first time
                    postings_id = cls.get_posting_id()
                    dictionary[token] = postings_id
                postings = inverted_index.setdefault(postings_id, {})
                postings.setdefault(doc_id, []).append(position)
        # Save created index to file
        cls.save_index(dictionary, inverted_index)

    @classmethod
    def get_posting_id(cls):
        """
        Returns globally incremented ID for next postings list
        """
        cls.POSTING_ID += 1
        return cls.POSTING_ID - 1

    @classmethod
    def get_postings(cls, token, dictionary, index):
        """
        Returns documents and positions of given token after normalization

        Returns:
            {document ID: [positions]}, or an empty dict for an unknown token
        """
        stem = Tokenizer.normalize_and_stem(token)
        posting_id = dictionary.get(stem)
        return index.get(posting_id, {})

    @classmethod
    def save_index(cls, directory, index):
        """
        Save dictionary and inverted index to file

        Args:
            directory: the token dictionary (the parameter name is kept for
                compatibility with existing callers)
            index: the inverted index
        """
        with open(cls.DICTIONARY_NAME, 'wb') as dictionary_file:
            pickle.dump(directory, dictionary_file)
        with open(cls.INDEX_NAME, 'wb') as index_file:
            pickle.dump(index, index_file)

    @classmethod
    def get_index(cls):
        """
        Load dictionary and inverted index from file
        Returns:
            dictionary, index
        """
        # NOTE: unpickling can execute arbitrary code; only load index files
        # that were written by save_index.
        with open(cls.DICTIONARY_NAME, 'rb') as dictionary_file:
            dictionary = pickle.load(dictionary_file)
        with open(cls.INDEX_NAME, 'rb') as index_file:
            index = pickle.load(index_file)
        return dictionary, index

    @classmethod
    def remove_index(cls):
        """
        Removes old inverted index files

        Each file is removed independently, so one missing file does not
        leave the other behind. Failures are ignored (best effort).
        """
        for path in (cls.DICTIONARY_NAME, cls.INDEX_NAME):
            try:
                os.remove(path)
            except OSError:
                pass

    @classmethod
    def is_indexed(cls):
        """
        Checks whether both index files exist
        """
        return os.path.isfile(cls.DICTIONARY_NAME) and os.path.isfile(cls.INDEX_NAME)
Loading

0 comments on commit 8163e20

Please sign in to comment.