Skip to content

Commit

Permalink
#171 Removing guava dependency and adding bloom filter.
Browse files Browse the repository at this point in the history
  • Loading branch information
jzonthemtn committed Dec 13, 2024
1 parent e42391b commit 2150d10
Show file tree
Hide file tree
Showing 12 changed files with 94 additions and 61 deletions.
11 changes: 5 additions & 6 deletions phileas-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@
<artifactId>retrofit</artifactId>
<version>${retrofit.version}</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${gson.version}</version>
</dependency>
<dependency>
<groupId>com.squareup.retrofit2</groupId>
<artifactId>converter-gson</artifactId>
Expand Down Expand Up @@ -124,12 +129,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -826,8 +826,7 @@ public List<Filter> getFiltersForPolicy(final Policy policy, final Map<String, M

final String classification = customDictionary.getClassification();

enabledFilters.add(new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration,
terms, classification, phileasConfiguration.bloomFilterFpp()));
enabledFilters.add(new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, terms, classification));

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.services.MetricsService;
import com.google.common.reflect.TypeToken;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import okhttp3.Authenticator;
import okhttp3.ConnectionPool;
import okhttp3.Credentials;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public void filterDictionaryExactMatch() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george", "ted", "Bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived with Bill in California.", attributes);

Expand All @@ -74,7 +74,7 @@ public void filterDictionaryCaseInsensitiveMatch() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george", "ted", "bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived with Bill in California.", attributes);

Expand All @@ -97,7 +97,7 @@ public void filterDictionaryNoMatch() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george", "ted", "bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived with Sam in California.", attributes);

Expand All @@ -118,7 +118,7 @@ public void filterDictionaryPhraseMatch1() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george jones", "ted", "bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"He lived with george jones in California.", attributes);

Expand All @@ -141,7 +141,7 @@ public void filterDictionaryPhraseMatch2() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george jones jr", "ted", "bill smith", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"Bill Smith lived with george jones jr in California.", attributes);

Expand Down
11 changes: 0 additions & 11 deletions phileas-model/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,6 @@
<artifactId>antlr4-runtime</artifactId>
<version>${antlr.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
Expand Down Expand Up @@ -150,12 +145,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ public int spanWindowSize() {
return Integer.parseInt(getProperty("span.window.size", "5"));
}

public double bloomFilterFpp() {
return Double.parseDouble(getProperty("filter.fpp", "0.05"));
}

// Caching

public boolean cacheRedisEnabled() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,25 @@
*/
package ai.philterd.phileas.model.filter.rules.dictionary;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import ai.philterd.phileas.model.enums.FilterType;
import ai.philterd.phileas.model.filter.FilterConfiguration;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.objects.Replacement;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.utils.BloomFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* A filter that operates on a bloom filter.
Expand All @@ -48,18 +50,16 @@ public class BloomFilterDictionaryFilter extends DictionaryFilter {
* @param filterConfiguration The {@link FilterConfiguration} for the filter.
* @param terms
* @param classification
* @param fpp
*/
public BloomFilterDictionaryFilter(FilterType filterType,
FilterConfiguration filterConfiguration,
Set<String> terms,
String classification,
double fpp) {
String classification) {

super(filterType, filterConfiguration);

this.lowerCaseTerms = new HashSet<>();
this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), terms.size(), fpp);
this.bloomFilter = new BloomFilter<>(terms.size());
this.classification = classification;

// Find the max n-gram size. It is equal to the maximum
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package ai.philterd.phileas.model.utils;

import org.apache.commons.codec.digest.MurmurHash3;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.function.Function;

import static org.apache.commons.codec.digest.MurmurHash3.DEFAULT_SEED;

/**
 * A minimal in-memory bloom filter backed by a {@link BitSet}.
 *
 * <p>Each element is converted to its {@code toString()} form, encoded as UTF-8,
 * and hashed by two functions (32-bit MurmurHash3 with the default seed, and the
 * first four bytes of an MD5 digest). The resulting hashes select the bits that
 * are set by {@link #put(Object)} and tested by {@link #mightContain(Object)}.
 *
 * <p>As with any bloom filter, {@link #mightContain(Object)} can return
 * {@code true} for an element that was never added, but never returns
 * {@code false} for an element that was added.
 *
 * <p>This class performs no synchronization.
 *
 * @param <T> The type of the elements; only {@code toString()} is used for hashing.
 */
public class BloomFilter<T> {

    private final BitSet bitSet;

    /** Number of addressable bits; always greater than zero. */
    private final int numberOfBits;

    private final Function<T, Integer>[] hashFunctions;

    /**
     * Creates a new, empty bloom filter.
     *
     * @param size The requested number of bits. A value of {@code 0} (e.g. an
     *             empty dictionary) is treated as {@code 1} so that the filter
     *             remains usable. The backing {@link BitSet} may allocate more
     *             bits than requested.
     * @throws NegativeArraySizeException if {@code size} is negative.
     */
    public BloomFilter(int size) {
        // new BitSet(0).size() is 0, which previously caused a modulo-by-zero
        // ArithmeticException on the first put()/mightContain() call.
        this.bitSet = new BitSet(size == 0 ? 1 : size);
        // The bit set is never written past this bound, so its size is stable.
        this.numberOfBits = this.bitSet.size();
        this.hashFunctions = createHashFunctions();
    }

    /**
     * Adds an element to the filter.
     *
     * @param element The element to add.
     * @throws NullPointerException if {@code element} is {@code null}.
     */
    public void put(T element) {
        for (final Function<T, Integer> hashFunction : hashFunctions) {
            bitSet.set(indexFor(hashFunction.apply(element)));
        }
    }

    /**
     * Tests whether an element may have been added to the filter.
     *
     * @param element The element to test.
     * @return {@code false} if the element was definitely not added;
     *         {@code true} if it may have been added.
     * @throws NullPointerException if {@code element} is {@code null}.
     */
    public boolean mightContain(T element) {
        for (final Function<T, Integer> hashFunction : hashFunctions) {
            if (!bitSet.get(indexFor(hashFunction.apply(element)))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Maps a hash value to a valid bit index.
     *
     * <p>{@code Math.abs(Integer.MIN_VALUE)} is still negative, so
     * {@code Math.abs(hash) % n} can produce a negative index;
     * {@link Math#floorMod(int, int)} always yields a value in {@code [0, n)}.
     */
    private int indexFor(int hash) {
        return Math.floorMod(hash, numberOfBits);
    }

    /**
     * Builds the hash functions used to select bits for an element.
     *
     * @return An array holding the MurmurHash3-based and MD5-based hash functions.
     */
    private Function<T, Integer>[] createHashFunctions() {

        // Generic array creation is unchecked; this is safe because only
        // Function<T, Integer> instances are stored and the array never
        // escapes this class.
        @SuppressWarnings("unchecked")
        final Function<T, Integer>[] functions = new Function[2];

        functions[0] = (T element) -> {
            final byte[] data = element.toString().getBytes(StandardCharsets.UTF_8);
            return MurmurHash3.hash32x86(data, 0, data.length, DEFAULT_SEED);
        };

        functions[1] = (T element) -> {

            try {

                // MessageDigest instances are stateful, so a fresh one is obtained per call.
                final MessageDigest digest = MessageDigest.getInstance("MD5");
                final byte[] hash = digest.digest(element.toString().getBytes(StandardCharsets.UTF_8));

                // Fold the first four digest bytes into a big-endian int.
                int hashCode = 0;
                for (int i = 0; i < 4; i++) {
                    hashCode = (hashCode << 8) | (hash[i] & 0xFF);
                }

                return hashCode;

            } catch (NoSuchAlgorithmException e) {
                throw new RuntimeException("MD5 algorithm not found", e);
            }

        };

        return functions;

    }

}
6 changes: 0 additions & 6 deletions phileas-services/phileas-services-alerts/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
11 changes: 0 additions & 11 deletions phileas-services/phileas-services-anonymization/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,11 @@
<artifactId>generex</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>ai.grakn</groupId>
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
6 changes: 0 additions & 6 deletions phileas-services/phileas-services-disambiguation/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
1 change: 0 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@
<dropwizard-metrics-prometheus.version>1.13.1</dropwizard-metrics-prometheus.version>
<equals.verifier.version>3.17.3</equals.verifier.version>
<ff3.version>1.0.4</ff3.version>
<guava.version>33.3.1-jre</guava.version>
<hapi.fhir.version>4.2.0</hapi.fhir.version>
<gson.version>2.11.0</gson.version>
<icu4j.version>67.1</icu4j.version>
Expand Down

0 comments on commit 2150d10

Please sign in to comment.