% paper_hashtag_federation/talk-slides.tex

% 579 lines
% 20 KiB
% TeX

% $Header$
% use lualatex for compilation
\documentclass[aspectratio=169,navbar=false]{beamer}
% This file is a solution template for:
% - Talk at a conference/colloquium.
% - Talk length is about 20min.
% - Style is ornate.
% Copyright 2004 by Till Tantau <tantau@users.sourceforge.net>.
%
% In principle, this file can be redistributed and/or modified under
% the terms of the GNU Public License, version 2.
%
% However, this file is supposed to be a template to be modified
% for your own needs. For this reason, if you use this file as a
% template and not specifically distribute it as part of another
% package/program, I grant the extra permission to freely copy and
% modify this file as you see fit and even to delete this copyright
% notice.
\mode<presentation>
{
\usetheme[cd2018,noddc,darktitlepage]{tud}
\usecolortheme{tud}
% or ...
%\setbeamercovered{transparent}
% or whatever (possibly just delete it)
}
% notes on 2nd screen:
\usepackage{pgfpages}
\setbeameroption{show notes on second screen}
\usepackage[british]{babel}
% or whatever
\usepackage[backend=biber, sorting=none]{biblatex}
\addbibresource{literature.bib}
\usepackage{ccicons}
\usepackage{wrapfig}
\usepackage{ifluatex}
\ifluatex
\usepackage{fontspec}
%\setmainfont{TeX Gyre Pagella}
%\RequirePackage{unicode-math}
%\setmathfont{XITS Math}
%\setmainfont{Open Sans}
%\setsansfont{Open Sans}
%\setmathfont[range={it}]{Open Sans:style=Italic}
%\setmathfont[range={it}]{Open Sans}
\else
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
% Or whatever. Note that the encoding and the font should match. If T1
% does not look nice, try deleting the line with the fontenc.
\fi
\title[Decentralised Hashtag Federation] % (optional, use only with long paper titles)
{Decentralised Hashtag Search and Subscription
for Federated Social Networks}
\author
{Trolli Schmittlauch}
% - Give the names in the same order as they appear in the paper.
% - Use the \inst{?} command only if the authors have different
% affiliation.
\institute[] % (optional, but mostly needed)
{
Department of Computer Science\\
Technical University Dresden
}
\date[APConf 2019] % (optional, should be abbreviation of conference name)
{ActivityPubConf 2019}
\datecity{Prague}
% - Either use conference name or its abbreviation.
% - Not really informative to the audience, more for people (including
% yourself) who are reading the slides online
%\subject{Privacy}
% This is only inserted into the PDF information catalog. Can be left
% out.
% If you have a file called "university-logo-filename.xxx", where xxx
% is a graphic format that can be processed by latex or pdflatex,
% resp., then you can add a logo as follows:
% \pgfdeclareimage[height=0.5cm]{university-logo}{university-logo-filename}
% \logo{\pgfuseimage{university-logo}}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
%\AtBeginSubsection[]
%{
% \begin{frame}<beamer>{Outline}
% \tableofcontents[currentsection,currentsubsection]
% \end{frame}
%}
% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
%\beamerdefaultoverlayspecification{<+->}
\begin{document}
\maketitle
\note{introduce myself:\\
usually go by schmittlauch on the Internet\\
student of Computer Science @ TU Dresden\\
interest in federated systems and unusual social networks\\
presenting my work on a student research paper from this year}
\begin{frame}{Outline}
\tableofcontents
% You might wish to add the option [pausesections]
\end{frame}
% Structuring a talk is a difficult task and the following structure
% may not be suitable. Here are some rules that apply for this
% solution:
% - Exactly two or three sections (other than the summary).
% - At *most* three subsections per section.
% - Talk about 30s to 2min per frame. So there should be between about
% 15 and 30 frames, all told.
% - A conference audience is likely to know very little of what you
% are going to talk about. So *simplify*!
% - In a 20min talk, getting the main ideas across is hard
% enough. Leave out details, even if it means being less precise than
% you think necessary.
% - If you omit details that are vital to the proof/implementation,
% just say so once. Everybody will be happy with that.
\section{Motivation}
\begin{frame}{Welcome to ActivityPubConf!}{Motivation}
\only<1>{
\includegraphics[width=\textwidth]{figures/toot_nohashtags.png}
\note{Who has been posting about this Conference?}
}
\only<2>{
\includegraphics[width=\textwidth]{figures/toot_hashtags.png}
\note{And who used \#ActivityPubConf?}
}
\end{frame}
\subsection{Importance of \#Hashtags}
\begin{frame}{Importance of \#Hashtags}{}
Hashtags are used for marking posts about certain topics or events:
\note{mark topics of posts, make them discoverable by content. No decentralised full text search in Fediverse (centralised search engines)}
\begin{columns}
\begin{column}{0.47\textwidth}
\begin{itemize}
\item<1-> \textbf{events}: \#ActivityPubConf, \#CCCamp19
\item<2-> \textbf{political topics}: \#SaveTheInternet
\item<3-> \textbf{general topics}: \#mastoadmin, \#Tusky
\item<4-> \textbf{ongoing demonstrations}: \#GeziPark, \#WomensMarch
\item<5-> \textbf{social movements}: \#MeToo
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{overlayarea}{\textwidth}{0.6\paperheight}
\center
\only<1>{
\includegraphics[width=\textwidth]{figures/APConfLogo.png}
}
\only<2>{
\includegraphics[height=0.58\paperheight]{figures/hashtag_savetheinternet.jpg}\\
\tiny{\href{https://www.flickr.com/photos/8183946@N05/14733648892}{``Obama in the Backseat: Rally to Save the Internet''} by \href{https://www.flickr.com/photos/8183946@N05}{Free Press Pics} is licensed under \href{https://creativecommons.org/licenses/by-sa/2.0/?ref=ccsearch&atype=rich}{CC BY-SA 2.0} \ccbysa}
}
\only<3>{
\includegraphics[height=0.6\paperheight]{figures/Elephant_Friend_(Greeting).png}
}
\only<4>{
\includegraphics[width=\textwidth]{figures/hashtag_gezipark.jpg}\\
\tiny{\ccby \href{https://creativecommons.org/licenses/by/4.0/}{CC BY 4.0 International} by \href{https://metronaut.de}{Metronaut}}
}
\only<5>{
\includegraphics[height=0.6\paperheight]{figures/hashtag_metoo.jpg}\\
\tiny{\href{https://www.flickr.com/photos/50612692@N04/28039368079}{``IMG\_4263''} by \href{https://www.flickr.com/photos/50612692@N04}{GGAADD} is licensed under \href{https://creativecommons.org/licenses/by-sa/2.0/?ref=ccsearch&atype=rich}{CC BY-SA 2.0} \ccbysa}
}
\end{overlayarea}
\end{column}
\end{columns}
\end{frame}
\subsection{State of Hashtags in the Fediverse}
\begin{frame}{State of Hashtags on the Fediverse}{}
{\center \Large Hashtags are used in the Fediverse}
\pause
\vspace{2em}
{\large But do they behave as expected?}
\end{frame}
\begin{frame}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[height=0.65\paperheight]{figures/{activitypubconf_toot.matereal.eu}.png}
\caption{\#activitypubconf on the single-user instance \textit{toot.matereal.eu}}
\end{figure}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[height=0.65\paperheight]{figures/{activitypubconf_mastodon.social}.png}
\caption{\#activitypubconf on the large instance \textit{mastodon.social}}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{State of Hashtags on the Fediverse}{Fragmentation}
\begin{itemize}
\item fragmented view on hashtag posts depending on user's instance\note[item]{view depends on user's instance}
\item hashtag search only on locally known posts\note[item]{local posts}
\item Result: incentive to cluster on large nodes \(\Rightarrow\) centralisation\note[item]{cluster incentive}
\end{itemize}
\end{frame}
\begin{frame}{Reason}{Push-Federation}
\begin{columns}
\begin{column}{0.55\textwidth}
\includegraphics[height=0.6\paperheight]{figures/push_federation.pdf}
\end{column}
\note{example scenario with 4 instances}
\begin{column}{0.45\textwidth}
\begin{itemize}
\only<1>{\item subscription to \texttt{@alice@cyber.space} by contacting instance \texttt{cyber.space}}
\only<2>{\item all future posts by Alice are delivered to instances of subscribers, but \textit{not} instances without any subscriber\note[item]{cyber.space may not even be aware of existence of other instances}}
\only<3>{\item other ways for posts to reach an instance:\\ boosts, thread resolution\note[item]{posts can disseminate through other means}}
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Current Solutions}
\begin{itemize}
\item Mastodon PubRelay or Pleroma lite-pub relay:
\begin{itemize}
\item centralised actor relaying all incoming posts
\item single point of failure, which relay to choose?
\item relaying all incoming posts \(\Rightarrow\) huge load on small instances
\item only access to posts sent after initial subscription
\end{itemize}
\item Diaspora* SocialRelay
\begin{itemize}
\item similar, but allows subscribing to certain tags only
\end{itemize}
\note{relays\\
centralised actors, single point\\
which one to choose?\\
all posts \(\rightarrow\) overload\\
after subscription}
\end{itemize}
\end{frame}
\section{System Architecture}
\begin{frame}{System Architecture}{Goals}
\begin{itemize}
\item \textbf{relay \& subscribe}: instances can subscribe to all public posts of a hashtag
\item \textbf{store \& query}: instances can retrieve 1 year of history for a hashtag without subscription
\item fully decentralised, no single point of authority for all tags
\end{itemize}
\note{2 goals: relay \& subscribe; store \& query; fully decentralised}
\end{frame}
\begin{frame}{System Architecture}{adding a DHT backend to the Fediverse}
\note[item]{subscription to all posts of a user possible because there is a single responsible instance}
core idea: distribute responsibility for tags among instances using a \textbf{D}istributed \textbf{H}ash \textbf{T}able, \note[item]{distribute responsibility for posts of a hashtag = relaying \& storage}
based on Chord~\cite{stoicaChordScalablePeertopeer2003}
%ToDo for Kolloquium: reason for Chord
\note[item]{DHT: structured P2P networks, self-organising, no central authority}
\note[item]{provides efficient (log N) key-value storage and lookup}
\end{frame}
\begin{frame}{System Architecture}{adding a DHT backend to the Fediverse}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{itemize}
\note[item]{common namespace for nodes and lookup keys}
\item calculate hash value of hashtags and node IDs
\item place these hashes onto the same circular name space
\item each node keeps routing table of \(\log \#\mathit{number\_nodes}\) entries\note[item]{routing table entries to distances of powers of 2}
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\includegraphics[height=0.62\paperheight]{figures/finger_table_routing_1.pdf}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{System Architecture}{adding a DHT backend to the Fediverse}
\begin{columns}
\begin{column}{0.55\textwidth}
\begin{itemize}
\item responsible node for a key \(k\) is node \(\mathit{successor}(k) = \min_i(k+i) \bmod \mathit{keyspace\_size}\)
\item DHT used for iterative lookup of responsible relay/ storage node
\note[item]{iterative lookup of responsible successor node of key}
\item my architecture: keyspace = \(2^{256}\) with 256-bit-long IDs
\note[item]{I use a keyspace of \(2^{256}\)}
\end{itemize}
\end{column}
\begin{column}{0.45\textwidth}
\includegraphics[height=0.62\paperheight]{figures/finger_table_routing_2.pdf}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Publishing, Relaying and Storage}{lifecycle of posts}
\begin{enumerate}
\item publishing instance looks up responsible relay instance on DHT for each included hashtag\note[item]{calculate hash sum of hashtag \(\rightarrow\) lookup key for DHT}
\item publishing instance sends post to responsible relay instance
\item relay instance looks up responsible storage node on DHT
\item relay instance verifies incoming post's signature, then relays post URI (ID) to all subscribers + storage node\note[item]{only post ID relayed, but not full post content. Reasons: LDSignatures not supported everywhere, deniability \& revocation}
\item subscribing instances can now retrieve the full authenticated post from received post URI
\end{enumerate}
\note[item]{for joining and leaving the DHT see paper}
\end{frame}
\begin{frame}
\note{so far so easy.\\
what hashtags does a certain instance handle? determined by node ID\\
can it bear the load?}
\begin{itemize}
\item node ID determines set of hashtags handled by instance
\item problem: for security reasons, nodes \textbf{must not} choose their IDs freely
\item Can instances be overloaded by their assigned hashtag posts?
\end{itemize}
\end{frame}
\begin{frame}{Distribution of Posts per Tag}
\includegraphics[width=0.49\textwidth]{statistics/twitter_hashtags_total.png}
~
\includegraphics[width=0.49\textwidth]{statistics/geraspora_hashtags_total.png}
\note{analysis of a 1 month dump of Twitter, Geraspora (Diaspora) and Friendica posts\\
Twitter: 70\% of hashtags used just once\\
note the logarithmic axis!}
distribution of posts per hashtag follows a steep power law
\note{So what if a small node gets several large hashtags? \(\Rightarrow\) need for load balancing}
\end{frame}
\begin{frame}{Load Balancing}{of hashtags between nodes}
\begin{itemize}
%ToDo for Kolloquium: reason for choosing that algorithm
\item \textit{k-choices} algorithm by Ledlie and Seltzer~\cite{ledlieHarvardTechnicalReport2004}
\item a node can represent several \textit{virtual nodes} on the DHT
\item \(\kappa\) possible virtual node IDs: \(\mathit{ID} = \mathrm{hash}(\mathit{ID}' \mathbin{+\!\!+} i),\ i \in \{0, 1, \dots, \kappa-1\}\)
\item nodes have a \textbf{capacity} and choose set of active IDs according to lowest mismatch of own and neighbour node capacity
\item querying load of potential IDs before joining, periodic re-balancing
\item independent load balancing of relay and storage nodes due to independent DHTs\note[item]{enable different roles ``relay'', ``storage'' to balance independently: 2 DHTs}
\note[item]{a simple simulation of the effectiveness of the balancing algorithm can be found in the paper}
\end{itemize}
% for Kolloquium, add simulation result
\end{frame}
\begin{frame}{Redundancy}
\begin{columns}
\begin{column}{0.55\textwidth}
\begin{itemize}
\item redundant assignment of responsibility for hashtag at equal distances on Chord ring, inspired by Harvesf and Blough~\cite{harvesfEffectReplicaPlacement2006}\note{resilience against node failure, allows data validation through cross-checking}
\item default redundancy: \(2^2 = 4\), scalable in powers of 2
\item \textbf{relay nodes}: hot standby nodes take over in overload situations (load spikes)
\item \textbf{storage nodes}: overloaded nodes can split stored posts by content hash and double redundancy set
\end{itemize}
\end{column}
\begin{column}{0.45\textwidth}
\includegraphics[width=\textwidth]{figures/redundancy_ring.pdf}
\end{column}
\end{columns}
\end{frame}
\section{Discussion}
\begin{frame}{Discussion}{I need YOUR feedback}
I want feedback from all of you, no matter whether it's from a \textit{\LARGE technical} or from a \textit{\LARGE social perspective}.
\note{architecture just a concept so far\\
before implementations: several open questions}
\end{frame}
\subsection{Social Considerations}
\begin{frame}{Social Considerations}
Do we even want global hashtags in the Fediverse?
\begin{itemize}
\item positive potential (conversation, coordination) vs. negative potential (spam, harassment)
\note[item]{positive vs negative potential}
\item visibility level: public posts only, unlisted, new level necessary?
\note[item]{visibility levels}
\item relaying post URI only should provide plausible deniability and retractability
\note[item]{retractability should be given}
\end{itemize}
\end{frame}
\subsection{Technical Considerations}
\begin{frame}{Technical Considerations}{instance admins}
\begin{itemize}
\item intended as opt-in, domain-based push federation still better for user subscriptions
\note[item]{optional mechanism, let's keep C2S communication for mobile friendliness and PushFed for simplicity}
\item assumption: instances offer \(5.5\times\) the storage \& \(2.5\times\) the bandwidth of own posts
\note[item]{assumptions: 5.5x storage, 2.5x bandwidth}
\item performance: Can this be implemented efficiently enough to not DDoS popular hashtag nodes?
\note[item]{performance: fetch DDoS of popular tags}
\begin{itemize}
\item batched retrieval of posts from same source
\item exponential backoff retries
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Technical Considerations}{integration into the ActivityPub Fediverse}
\begin{itemize}
\item This architecture is an unimplemented concept so far!
\item integration into ActivityPub ecosystem\note[item]{disclaimer: I'm new to ActivityPub and have no implementation experience}
\begin{itemize}
\item hashtags may be represented as relay actors with own in- \& outbox, addressed in cc
\item relaying to subscribers via SharedInbox
\item idea for addressing: new URI scheme that gets transparently resolved to responsible node's domain via DHT by application proxy\note[item]{application proxy for transparent URI scheme resolving?}
\item signalling of error codes and redundancy factors is needed
\end{itemize}
\item DHT routing communication does not use ActivityPub
\end{itemize}
\end{frame}
\begin{frame}{Technical Considerations}{node ID assignment}
\note[item]{Let's talk about the elephant in the room of ``federated services''}
\centering
\includegraphics<1>[height=0.59\paperheight]{figures/Elephant_Friend_(Greeting).png}
\includegraphics<2,4>[height=0.59\paperheight]{figures/hist_num_single_vs.png}
\note[item]{common DHT attack: Sybil-… = 1 attacker introduces large number of nodes}
\note[item]{sorry to all instance admins, but: CloudFlare behaves like a MITM/ Sybil attacker}
\note[item]{node ID derivation: first 64 bits of IPv6}
\only<2,4>{1st peak: Masto.host, 2nd peak: Cloudflare}
\only<3>{{\ttfamily
\(h_n =\) hash(IPv6\_addr[0,63] ++ vserver)[0,63] \\
++ hash(domain ++ vserver)[0,127] \\
++ hash(IPv6\_addr[0,63] ++ vserver)[64,127]
}
\vspace{2em}
node ID derivation}
\end{frame}
\subsection{Security Considerations}
\begin{frame}{Security Considerations}
\begin{itemize}
\item attacker shall not be able to deliberately gain responsibility for certain hashtags
\begin{itemize}
\item node ID mainly dependent on IPv6 subnet
\end{itemize}
\item attacker shall not introduce arbitrary number of nodes
\begin{itemize}
\item valid domain required for node ID derivation, assumption: domains cost money
\end{itemize}
\note{not perfectly secure, but the best I could think of. Better ideas welcome}
\end{itemize}
\end{frame}
\section{Summary}
\begin{frame}{Summary}
% Keep the summary *very short*.
\begin{itemize}
\item
decentralised architecture for handling posts of the same hashtag:
\begin{itemize}
\item \alert{subscribe to hashtag} and get posts \alert{relay}ed
\item \alert{query stored posts} of a certain hashtag without subscription
\end{itemize}
\item responsibility for hashtag divided among instances using a DHT
\item architecture \alert{balances the load} between nodes and maintains \alert{redundancy}
\item several open questions before implementation
\end{itemize}
% The following outlook is optional.
%\vskip0pt plus.5fill
\end{frame}
% All of the following is optional and typically not needed.
\appendix
\section<presentation>*{\appendixname}
\subsection<presentation>*{For Further Reading}
\begin{frame}
\center\huge{Questions, comments, feedback?}
\includegraphics[height=0.4\paperheight]{figures/qr_paper.png}\\
\large\url{https://git.orlives.de/schmittlauch/paper_hashtag_federation/src/branch/master/paper_hashtag_federation.pdf}
\vspace{2em}
\Large\includegraphics[height=1em]{figures/fediverse_logo.pdf} @schmittlauch@toot.matereal.eu
\end{frame}
\begin{frame}[allowframebreaks]
\frametitle{References}
\printbibliography
\end{frame}
%\includegraphics[height=0.5\textheight]{figures/nomnompingu.png}\tiny\footnote{CC-BY-SA 3.0 by Elektroll}
\end{document}