-
Notifications
You must be signed in to change notification settings - Fork 34
/
archive.sh
28 lines (21 loc) · 1.04 KB
/
archive.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#!/bin/bash
#
# Builds data/nyan_archive.tar.gz out of documents and clusters.
#
# Steps: export new documents, clean new and old documents, merge and filter
# them, export new clusters, merge them with the old ones and filter them
# against the filtered documents, then pack everything with channels.json.
#
# Inputs that must already exist:
#   data/raw_old_docs.jsonl, data/old_clusters.jsonl, channels.json
# Output:
#   data/nyan_archive.tar.gz (clusters.jsonl, documents.jsonl, channels.json)
#
# All paths are relative, so run this from the directory that contains
# data/ and channels.json and from which `python3 -m scripts.<module>`
# resolves (presumably the repository root -- confirm).
set -euo pipefail

echo "Exporting documents"
python3 -m scripts.mongo_to_jsonl --output-path data/raw_new_docs.jsonl

echo "Cleaning documents"
python3 -m scripts.clean_docs \
  --input-path data/raw_new_docs.jsonl \
  --output-path data/new_docs.jsonl
python3 -m scripts.clean_docs \
  --input-path data/raw_old_docs.jsonl \
  --output-path data/old_docs.jsonl
# Old documents first, then the new ones, in a single file.
cat data/old_docs.jsonl data/new_docs.jsonl > data/all_docs.jsonl

echo "Filtering documents"
python3 -m scripts.filter_documents data/all_docs.jsonl data/documents.jsonl

echo "Exporting clusters"
python3 -m scripts.clusters_to_jsonl --output-path data/new_clusters.jsonl
# Old clusters first, then the new ones, in a single file.
cat data/old_clusters.jsonl data/new_clusters.jsonl > data/all_clusters.jsonl

echo "Filtering clusters"
# NOTE(review): "clusers" is kept verbatim; it has to match the module name
# under scripts/ -- verify before renaming.
python3 -m scripts.filter_posted_clusers \
  data/all_clusters.jsonl data/clusters.jsonl data/documents.jsonl

echo "Packing"
cp channels.json data/channels.json
# -f: a missing archive (e.g. on the first run) must not abort the script
# under `set -e` right before packing.
rm -f data/nyan_archive.tar.gz
# tar runs inside data/ so archive members are stored without the data/ prefix.
cd data && tar -czvf nyan_archive.tar.gz clusters.jsonl documents.jsonl channels.json