-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
25 changed files
with
1,888 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
# CmpE493 - Introduction to Information Retrieval | ||
| Term | Instructor | | ||
| --- | --- | | ||
| Spring 2018 | Arzucan Özgür | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
The entry point of my application is app.py.
|
||
You should use python3 | ||
|
||
Run my app with "python3 app.py" command. | ||
|
||
If you run it for the first time, it creates the inverted index.
|
||
The default data directory name is "reuters21578".
|
||
The stop word file has to be in the current directory and be named "stopwords.txt".
|
||
You can use the help command for further information.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
import os | ||
import re | ||
from indexer import Indexer | ||
from query import Query | ||
from query import QueryType | ||
|
||
def make_bold(string):
    """
    Wraps the given value in terminal escape sequences so it is shown in bold
    """
    bold_on = "\033[1m"
    reset = "\033[0m"
    return bold_on + "{}".format(string) + reset
|
||
def command_help():
    """
    Writes the help page (commands and query types) to the terminal
    """
    bold = make_bold
    # Every entry is printed on its own line, in this order
    help_lines = [
        "\n\n\t\t\t\t\t" + bold("--- REUTERS SEARCH ENGINE ---"),
        bold("COMMANDS"),
        "\t" + bold("exit") + "\t\t Exits from program",
        "\t" + bold("help") + "\t\t Lists available commands",
        "\t" + bold("index [FOLDER]") + "\t Indexes document positional inverted index. Default: reuters21578",
        "\t" + bold("clear") + "\t\t Clear console screen",
        "\t" + bold("remove") + "\t\t Removes inverted index files",
        "\t" + bold("postings WORD") + "\t Returns postings of word",
        "\n\t" + "** There is no special command for query processing.",
        "\t" + "Inputs that aren't special command interpreted as query",
        "\n\t" + bold("[QUERY_TYPE] YOUR_QUERY") + "\t Processes query based on given type.",
        "\t\t\t\t If no type is given, it predicts query type",
        "\t" + bold("Query Types"),
        "\t\t" + bold(QueryType.CONJUNCTIVE) + " -> Conjunctive Query",
        "\t\t" + bold(QueryType.PHRASE) + " -> Phrase Query",
        "\t\t" + bold(QueryType.PROXIMITY) + " -> Proximity Query",
        "\n\n\n",
    ]
    for help_line in help_lines:
        print(help_line)
|
||
def command_index(directory):
    """
    Rebuilds the index from the data in the given directory

    Args:
        directory: folder that holds the data files, or None to use the
            default folder 'reuters21578'.

    The module-level ``dictionary`` and ``index`` are replaced with the newly
    built ones. If the directory does not exist, nothing is changed and the
    existing index files are kept.
    """
    global dictionary
    global index
    # Set default data directory
    if directory is None:
        directory = 'reuters21578'
    # Validate before touching the old index, so a mistyped folder name
    # cannot destroy a working index and then crash while listing files
    if not os.path.isdir(directory):
        print('Directory ' + directory + ' not found, index is left unchanged')
        return
    Indexer.remove_index()
    print('Indexing ' + directory + ' folder...')
    Indexer.create_index(directory=directory)
    dictionary, index = Indexer.get_index()
    print('Index created')
|
||
def command_remove():
    """
    Deletes the index files and empties the in-memory dictionary and index
    """
    global dictionary, index
    Indexer.remove_index()
    # Forget the loaded copies as well, so later lookups find nothing
    dictionary, index = {}, {}
    print('Index removed')
|
||
def command_postings(word, dictionary, index):
    """
    Looks up the postings of the given word and prints them
    """
    print(Indexer.get_postings(word, dictionary, index))
|
||
def command_exit():
    """
    Exits from application

    Raises:
        SystemExit: always, with status 0 because leaving through the
            "exit" command is a normal termination, not a failure.
    """
    # sys.exit is the programmatic API; the bare exit() helper is installed
    # by the site module and is meant for the interactive interpreter
    import sys

    print("Goodbye...")
    sys.exit(0)
|
||
def command_clear():
    """
    Clears terminal screen

    Runs the shell's screen-clearing command: "cls" on Windows, "clear"
    everywhere else.
    """
    os.system("cls" if os.name == "nt" else "clear")
|
||
#################################### | ||
########## APP START HERE ########## | ||
#################################### | ||
|
||
# If the index isn't created create it
if not Indexer.is_indexed():
    command_index(None)
else:
    print('Data is already indexed')

# Load the dictionary and inverted index that are stored on disk
dictionary, index = Indexer.get_index()

print("Type " + make_bold("help") + " for any documentation")
while True:
    # Get command from user and processes it
    try:
        command = input("query> ")
    except EOFError:
        # End of input (e.g. Ctrl-D): leave cleanly instead of a traceback
        print()
        command_exit()
        break
    postings_command = re.match(r'^postings\s(\w+)', command)
    # The whole line must be "index" or "index FOLDER". Otherwise a query
    # that merely starts with "index" (such as "indexes") would wipe and
    # rebuild the index. \S+ keeps folder names containing '/', '.' or '-'.
    index_command = re.fullmatch(r'index(?:\s+(\S+))?\s*', command)
    if command == "exit":
        command_exit()
    elif index_command:
        command_index(index_command.group(1))
    elif command == "help":
        command_help()
    elif command == "clear":
        command_clear()
    elif command == "remove":
        command_remove()
    elif postings_command:
        command_postings(postings_command.group(1), dictionary, index)
    else:
        # Anything that is not a special command is run as a query
        query = Query(command)
        result = query.run(dictionary, index)
        print(make_bold(str(len(result)) + ' documents are founded'))
        print(sorted(result))
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,142 @@ | ||
import os, re, pickle | ||
from tokenizer import Tokenizer | ||
|
||
class Indexer:
    """
    Handles inverted index operations

    The index consists of two pickled objects, stored under the relative
    file names DICTIONARY_NAME and INDEX_NAME:
      - dictionary: maps a token to the ID of its postings list
      - inverted index: maps a postings list ID to {doc_id: [positions]}
    """
    DICTIONARY_NAME = 'dictionary.txt'  # Name of the dictionary file
    INDEX_NAME = 'inverted_index.txt'  # Name of the inverted index file
    POSTING_ID = 1  # Starting ID for posting lists

    @classmethod
    def read_files(cls, directory=None):
        """
        Returns read documents from data directory

        Args:
            directory: folder to scan for ".sgm" files; the current working
                directory is used when it is None.

        Returns:
            List of document dicts (see extract_documents), gathered from the
            files in sorted file name order.
        """
        # If no directory is given, set it to current directory
        directory = os.getcwd() if directory is None else directory
        # Get all file with .sgm extension
        filenames = sorted(
            filename for filename in os.listdir(directory) if filename.endswith(".sgm")
        )
        documents = []
        # Extract documents from each file
        for filename in filenames:
            # "with" closes the handle right after the file has been read
            with open(os.path.join(directory, filename), "r", encoding="latin-1") as sgm_file:
                raw_data = sgm_file.read()
            documents += cls.extract_documents(raw_data)
        return documents

    @classmethod
    def extract_documents(cls, raw_data):
        """
        Extracts documents from raw string

        Args:
            raw_data: text in which every document ends with "</REUTERS>" and
                carries its ID as NEWID="<digits>"> in the opening tag.

        Returns:
            List of dicts with keys 'id' (int), 'title' (str) and 'body' (str).
            Chunks without a NEWID are skipped.
        """
        documents = []
        # Separate each document, then extract information from each chunk
        for raw_document in raw_data.split('</REUTERS>'):
            doc_id = re.search(r'NEWID="(?P<id>\d+)">', raw_document)
            # If raw corpus has ID, it's a document; otherwise ignore the chunk
            if not doc_id:
                continue
            doc_title = re.search(r'<TITLE>(?P<title>.+?)</TITLE>', raw_document, re.DOTALL)
            doc_body = re.search(r'<BODY>(?P<body>.+?)</BODY>', raw_document, re.DOTALL)
            # Some news don't have body or title, use empty string instead of them
            documents.append({
                'id': int(doc_id.group('id')),
                'title': doc_title.group('title') if doc_title else '',
                'body': doc_body.group('body') if doc_body else '',
            })
        return documents

    @classmethod
    def create_index(cls, directory=None):
        """
        Creates index from data that in given directory

        Builds the dictionary and the positional inverted index for every
        document returned by read_files and saves both with save_index.
        """
        # Read files and get documents
        documents = cls.read_files(directory)
        # Initialize dictionary and inverted index
        dictionary = {}
        inverted_index = {}
        # Load stop words from file
        stop_words = Tokenizer.stop_words()

        for document in documents:
            doc_id = document['id']
            # Concatenate title and body, then tokenize this combination
            tokens = Tokenizer.tokenize(document['title'] + ' ' + document['body'])
            # Positions count every token, stop words included
            for position, token in enumerate(tokens):
                if token in stop_words:
                    continue
                # Request a new postings ID only for a token seen for the
                # first time; passing get_posting_id() as the .get() default
                # would consume an ID on every single occurrence
                postings_id = dictionary.get(token)
                if postings_id is None:
                    postings_id = cls.get_posting_id()
                    dictionary[token] = postings_id
                # Add this position to the token's position list for this document
                postings = inverted_index.setdefault(postings_id, {})
                postings.setdefault(doc_id, []).append(position)
        # Save created index to file
        cls.save_index(dictionary, inverted_index)

    @classmethod
    def get_posting_id(cls):
        """
        Returns globally incremented ID for next postings list
        """
        posting_id = cls.POSTING_ID
        cls.POSTING_ID += 1
        return posting_id

    @classmethod
    def get_postings(cls, token, dictionary, index):
        """
        Returns documents and positions of given token after normalization

        Returns an empty dict when the normalized token is not indexed.
        """
        stem = Tokenizer.normalize_and_stem(token)
        posting_id = dictionary.get(stem)
        return index.get(posting_id, {})

    @classmethod
    def save_index(cls, directory, index):
        """
        Save dictionary and inverted index to file

        Args:
            directory: the token dictionary to store (the parameter keeps its
                original name so existing keyword callers keep working).
            index: the inverted index to store.
        """
        # Context managers make sure the data is flushed and the files closed
        with open(cls.DICTIONARY_NAME, 'wb') as dictionary_file:
            pickle.dump(directory, dictionary_file)
        with open(cls.INDEX_NAME, 'wb') as index_file:
            pickle.dump(index, index_file)

    @classmethod
    def get_index(cls):
        """
        Load dictionary and inverted index from file
        Returns:
            dictionary, index
        """
        # NOTE: unpickling can execute arbitrary code; only load index files
        # that were written by save_index
        with open(cls.DICTIONARY_NAME, 'rb') as dictionary_file:
            dictionary = pickle.load(dictionary_file)
        with open(cls.INDEX_NAME, 'rb') as index_file:
            index = pickle.load(index_file)
        return dictionary, index

    @classmethod
    def remove_index(cls):
        """
        Removes old inverted index files

        Errors while deleting are ignored (best effort).
        """
        # Delete each file on its own, so a missing dictionary file does not
        # leave a stale inverted index file behind
        for filename in (cls.DICTIONARY_NAME, cls.INDEX_NAME):
            try:
                os.remove(filename)
            except OSError:
                pass

    @classmethod
    def is_indexed(cls):
        """
        Checks if index is exist
        """
        return os.path.isfile(cls.DICTIONARY_NAME) and os.path.isfile(cls.INDEX_NAME)
Oops, something went wrong.