|
|
|
@ -6,7 +6,6 @@
|
|
|
|
|
#####################
|
|
|
|
|
|
|
|
|
|
# Output file locations — assigned once, so mark readonly to fail loudly
# on any accidental reassignment later in the script.
# NOTE(review): field 2 of this CSV is lowercased and counted below, so it
# presumably holds the hashtag — confirm against the extraction step.
readonly HISTFILE="./twitter_hashtag_history.csv"

# Per-hashtag occurrence counts (output of the uniq -c step below).
readonly DISTFILE="./twitter_hashtag_numbers.txt"
|
|
|
|
|
|
|
|
|
|
# extract hashtags and time stamp in parallel
|
|
|
|
|
|
|
|
|
@ -15,7 +14,3 @@ echo "$@" | parallel -d ' ' -j+0 --eta "echo \"parsing {}\"; zcat {} |jq -r 'if
|
|
|
|
|
# Merge all temporary per-input history files into the single history CSV.
# The redirection target is quoted so a path containing spaces cannot
# break or be word-split.
cat tmp_twitter_history_* > "${HISTFILE}"

# Clean-up of the temp files (left disabled on purpose).
# NOTE(review): the previously disabled line quoted the glob
# ('rm "tmp_twitter_history_*"'), which would look for a literal file named
# 'tmp_twitter_history_*' and never match; if re-enabled, use this form:
# rm -f -- tmp_twitter_history_*
|
|
|
|
|
|
|
|
|
|
# Count hashtag occurrence: take field 2 of the merged CSV, lowercase it so
# case variants (e.g. "#Foo" / "#foo") collapse into one bucket, then sort
# and count duplicates. The redirection target is quoted against
# word-splitting.
awk -F, '{print tolower($2)}' "${HISTFILE}" | sort | uniq -c > "${DISTFILE}"

# let's hope that sort can manage such huge files in memory
# (GNU sort spills to temp files on disk when it exceeds its memory buffer,
# so this degrades gracefully rather than failing outright)
|
|
|
|
|