Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Removing guava dependency and adding bloom filter #172

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 5 additions & 6 deletions phileas-core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@
<artifactId>retrofit</artifactId>
<version>${retrofit.version}</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>${gson.version}</version>
</dependency>
<dependency>
<groupId>com.squareup.retrofit2</groupId>
<artifactId>converter-gson</artifactId>
Expand Down Expand Up @@ -124,12 +129,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
</dependencies>
</project>
Original file line number Diff line number Diff line change
Expand Up @@ -826,8 +826,7 @@ public List<Filter> getFiltersForPolicy(final Policy policy, final Map<String, M

final String classification = customDictionary.getClassification();

enabledFilters.add(new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration,
terms, classification, phileasConfiguration.bloomFilterFpp()));
enabledFilters.add(new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, terms, classification));

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.services.MetricsService;
import com.google.common.reflect.TypeToken;

import com.google.gson.Gson;
import com.google.gson.reflect.TypeToken;
import okhttp3.Authenticator;
import okhttp3.ConnectionPool;
import okhttp3.Credentials;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ public void filterDictionaryExactMatch() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george", "ted", "Bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived with Bill in California.", attributes);

Expand All @@ -74,7 +74,7 @@ public void filterDictionaryCaseInsensitiveMatch() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george", "ted", "bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived with Bill in California.", attributes);

Expand All @@ -97,7 +97,7 @@ public void filterDictionaryNoMatch() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george", "ted", "bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE, "He lived with Sam in California.", attributes);

Expand All @@ -118,7 +118,7 @@ public void filterDictionaryPhraseMatch1() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george jones", "ted", "bill", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"He lived with george jones in California.", attributes);

Expand All @@ -141,7 +141,7 @@ public void filterDictionaryPhraseMatch2() throws Exception {
.build();

final Set<String> names = new HashSet<>(Arrays.asList("george jones jr", "ted", "bill smith", "john"));
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none", 0.05);
final BloomFilterDictionaryFilter filter = new BloomFilterDictionaryFilter(FilterType.CUSTOM_DICTIONARY, filterConfiguration, names, "none");

final FilterResult filterResult = filter.filter(getPolicy(), "context", "documentid", PIECE,"Bill Smith lived with george jones jr in California.", attributes);

Expand Down
11 changes: 0 additions & 11 deletions phileas-model/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,6 @@
<artifactId>antlr4-runtime</artifactId>
<version>${antlr.version}</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
Expand Down Expand Up @@ -150,12 +145,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,6 @@ public int spanWindowSize() {
return Integer.parseInt(getProperty("span.window.size", "5"));
}

public double bloomFilterFpp() {
return Double.parseDouble(getProperty("filter.fpp", "0.05"));
}

// Caching

public boolean cacheRedisEnabled() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,23 +15,25 @@
*/
package ai.philterd.phileas.model.filter.rules.dictionary;

import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;
import ai.philterd.phileas.model.enums.FilterType;
import ai.philterd.phileas.model.filter.FilterConfiguration;
import ai.philterd.phileas.model.objects.FilterResult;
import ai.philterd.phileas.model.objects.Replacement;
import ai.philterd.phileas.model.objects.Span;
import ai.philterd.phileas.model.policy.Policy;
import ai.philterd.phileas.model.utils.BloomFilter;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

import java.io.IOException;
import java.nio.charset.Charset;
import java.util.*;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

/**
* A filter that operates on a bloom filter.
Expand All @@ -48,18 +50,16 @@ public class BloomFilterDictionaryFilter extends DictionaryFilter {
* @param filterConfiguration The {@link FilterConfiguration} for the filter.
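* @param terms The dictionary terms; the number of terms is used to size the bloom filter.
* @param classification The classification assigned to this filter.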
* @param fpp
*/
public BloomFilterDictionaryFilter(FilterType filterType,
FilterConfiguration filterConfiguration,
Set<String> terms,
String classification,
double fpp) {
String classification) {

super(filterType, filterConfiguration);

this.lowerCaseTerms = new HashSet<>();
this.bloomFilter = BloomFilter.create(Funnels.stringFunnel(Charset.defaultCharset()), terms.size(), fpp);
this.bloomFilter = new BloomFilter<>(terms.size());
this.classification = classification;

// Find the max n-gram size. It is equal to the maximum
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
package ai.philterd.phileas.model.utils;

import org.apache.commons.codec.digest.MurmurHash3;

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.BitSet;
import java.util.function.Function;

import static org.apache.commons.codec.digest.MurmurHash3.DEFAULT_SEED;

/**
 * A minimal Bloom filter backed by a {@link BitSet}.
 *
 * <p>Every element is hashed by two functions, both computed over the UTF-8
 * bytes of the element's {@code toString()} value: a 32-bit MurmurHash3 and
 * the first four bytes of an MD5 digest. {@link #put(Object)} sets one bit per
 * hash function; {@link #mightContain(Object)} reports {@code true} only when
 * all of those bits are set, so it can return false positives but never a
 * false negative for an element that was previously put.
 *
 * <p>This class performs no synchronization of its own.
 *
 * <p>NOTE(review): the number of bits is taken directly from the constructor
 * argument (rounded up by {@link BitSet}). If callers pass the number of
 * expected elements, the false-positive rate will be high - confirm whether
 * the filter should be sized more generously.
 *
 * @param <T> the element type; elements are hashed via {@code toString()}
 */
public class BloomFilter<T> {

    private final BitSet bitSet;

    /** Number of addressable bits; always greater than zero. */
    private final int bitCount;

    private final Function<T, Integer>[] hashFunctions;

    /**
     * Creates an empty filter.
     *
     * @param size the requested number of bits; {@code 0} is treated as
     *             {@code 1} so that an empty dictionary still yields a usable
     *             filter
     * @throws NegativeArraySizeException if {@code size} is negative (raised
     *                                    by {@link BitSet})
     */
    public BloomFilter(int size) {
        // BitSet(0).size() is 0, which would make every index computation
        // divide by zero, so always allocate at least one bit.
        this.bitSet = new BitSet(size == 0 ? 1 : size);
        // No bit beyond size() - 1 is ever set, so the BitSet never grows and
        // this value stays valid for the lifetime of the filter.
        this.bitCount = bitSet.size();
        this.hashFunctions = createHashFunctions();
    }

    /**
     * Adds an element to the filter.
     *
     * @param element the element to add; must not be {@code null}
     */
    public void put(T element) {
        for (final Function<T, Integer> hashFunction : hashFunctions) {
            bitSet.set(indexFor(hashFunction.apply(element)), true);
        }
    }

    /**
     * Tests whether an element may have been added to the filter.
     *
     * @param element the element to look up; must not be {@code null}
     * @return {@code false} if the element was definitely never put,
     *         {@code true} if it possibly was
     */
    public boolean mightContain(T element) {
        for (final Function<T, Integer> hashFunction : hashFunctions) {
            if (!bitSet.get(indexFor(hashFunction.apply(element)))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Maps a 32-bit hash onto a valid bit index in {@code [0, bitCount)}.
     *
     * <p>{@code Math.abs(Integer.MIN_VALUE)} is still negative, so
     * {@code Math.abs(hash) % n} can produce a negative index;
     * {@link Math#floorMod(int, int)} is non-negative for every input.
     */
    private int indexFor(int hash) {
        return Math.floorMod(hash, bitCount);
    }

    /** Builds the fixed pair of hash functions used by this filter. */
    private Function<T, Integer>[] createHashFunctions() {

        // Generic arrays cannot be created directly; the raw array is only
        // ever populated with Function<T, Integer> instances below.
        @SuppressWarnings("unchecked")
        final Function<T, Integer>[] functions = new Function[2];

        // Hash 1: 32-bit MurmurHash3 of the UTF-8 bytes.
        functions[0] = (T element) -> {
            final byte[] data = element.toString().getBytes(StandardCharsets.UTF_8);
            return MurmurHash3.hash32x86(data, 0, data.length, DEFAULT_SEED);
        };

        // Hash 2: the first four bytes of the MD5 digest, big-endian.
        functions[1] = (T element) -> {

            try {

                final MessageDigest digest = MessageDigest.getInstance("MD5");
                final byte[] hash = digest.digest(element.toString().getBytes(StandardCharsets.UTF_8));

                int hashCode = 0;
                for (int i = 0; i < 4; i++) {
                    hashCode = (hashCode << 8) | (hash[i] & 0xFF);
                }

                return hashCode;

            } catch (NoSuchAlgorithmException e) {
                throw new RuntimeException("MD5 algorithm not found", e);
            }

        };

        return functions;

    }

}
6 changes: 0 additions & 6 deletions phileas-services/phileas-services-alerts/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
11 changes: 0 additions & 11 deletions phileas-services/phileas-services-anonymization/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -28,22 +28,11 @@
<artifactId>generex</artifactId>
<version>1.0.2</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>${guava.version}</version>
</dependency>
<dependency>
<groupId>ai.grakn</groupId>
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
6 changes: 0 additions & 6 deletions phileas-services/phileas-services-disambiguation/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,6 @@
<artifactId>redis-mock</artifactId>
<version>${redis-mock.version}</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.junit.jupiter</groupId>
Expand Down
1 change: 0 additions & 1 deletion pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@
<dropwizard-metrics-prometheus.version>1.13.1</dropwizard-metrics-prometheus.version>
<equals.verifier.version>3.17.3</equals.verifier.version>
<ff3.version>1.0.4</ff3.version>
<guava.version>33.3.1-jre</guava.version>
<hapi.fhir.version>4.2.0</hapi.fhir.version>
<gson.version>2.11.0</gson.version>
<icu4j.version>67.1</icu4j.version>
Expand Down
Loading