remove broken hashtag counting from Twitter data dump parsing script

This commit is contained in:
Trolli Schmittlauch 2020-07-09 02:21:31 +02:00
parent 97b0694410
commit 44e7d8c3cd

View file

@ -6,7 +6,6 @@
#####################
# Output locations, both relative to the current working directory.
# CSV that receives the merged contents of all temporary history files.
HISTFILE='./twitter_hashtag_history.csv'
# Text file that receives the per-hashtag occurrence counts.
DISTFILE='./twitter_hashtag_numbers.txt'
# extract hashtags and time stamp in parallel
@ -15,7 +14,3 @@ echo "$@" | parallel -d ' ' -j+0 --eta "echo \"parsing {}\"; zcat {} |jq -r 'if
# Merge all temporary history files into the single history file.
# The redirect target is quoted so a path containing whitespace cannot
# turn into an "ambiguous redirect".
cat tmp_twitter_history_* > "${HISTFILE}"
# Cleanup of the temporary files is currently disabled. If re-enabled, the
# glob must stay unquoted, otherwise it is passed to rm literally.
# rm -- tmp_twitter_history_*
# Count hashtag occurrences: lower-case the second comma-separated field
# (presumably the hashtag -- confirm against the extraction step), then
# tally identical values. uniq -c only merges adjacent lines, hence the sort.
awk -F, '{print tolower($2)}' "${HISTFILE}" | sort | uniq -c > "${DISTFILE}"
# NOTE(review): the history file can be huge. GNU sort falls back to
# temporary files once the input exceeds its memory buffer (tunable with
# -S / -T) -- confirm that the sort in use behaves this way.