remove broken hashtag counting from Twitter data dump parsing script
This commit is contained in:
parent
97b0694410
commit
44e7d8c3cd
|
@ -6,7 +6,6 @@
|
|||
#####################
|
||||
|
||||
# Destination of the merged per-input history files (tmp_twitter_history_*).
HISTFILE='./twitter_hashtag_history.csv'
|
||||
# Destination of the per-hashtag counts produced by `sort | uniq -c`.
DISTFILE='./twitter_hashtag_numbers.txt'
|
||||
|
||||
# extract hashtags and time stamp in parallel
|
||||
|
||||
|
@ -15,7 +14,3 @@ echo "$@" | parallel -d ' ' -j+0 --eta "echo \"parsing {}\"; zcat {} |jq -r 'if
|
|||
# merge all temporary history files to one
|
||||
# Quote the redirect target: an unquoted expansion that splits into several
# words (e.g. a path with spaces) makes bash abort with "ambiguous redirect".
cat tmp_twitter_history_* > "${HISTFILE}"
|
||||
# rm "tmp_twitter_history_*"
|
||||
|
||||
# count hashtag occurrences
|
||||
# Lower-case the second comma-separated field of every history row, then
# group identical values and prefix each with its count. The output path is
# quoted (like the input path) so the redirect survives whitespace in it.
awk -F, '{print tolower($2)}' "${HISTFILE}" | sort | uniq -c > "${DISTFILE}"
|
||||
# let's hope that sort can manage such huge files in memory
|
||||
|
|
Loading…
Reference in a new issue