% paper_hashtag_federation/talk-slides.tex
%
% 564 lines
% 18 KiB
% TeX

% $Header$
% use lualatex for compilation
\documentclass[aspectratio=169,navbar=false]{beamer}
% This file is a solution template for:
% - Talk at a conference/colloquium.
% - Talk length is about 20min.
% - Style is ornate.
% Copyright 2004 by Till Tantau <tantau@users.sourceforge.net>.
%
% In principle, this file can be redistributed and/or modified under
% the terms of the GNU Public License, version 2.
%
% However, this file is supposed to be a template to be modified
% for your own needs. For this reason, if you use this file as a
% template and not specifically distribute it as part of another
% package/program, I grant the extra permission to freely copy and
% modify this file as you see fit and even to delete this copyright
% notice.
\mode<presentation>
{
\usetheme[cd2018,noddc,darktitlepage]{tud}
\usecolortheme{tud}
% or ...
%\setbeamercovered{transparent}
% or whatever (possibly just delete it)
}
% notes on 2nd screen:
\usepackage{pgfpages}
\setbeameroption{show notes on second screen}
\usepackage[british]{babel}
% or whatever
\usepackage[backend=biber, sorting=none]{biblatex}
\usepackage{ccicons}
\usepackage{wrapfig}
\usepackage{ifluatex}
\ifluatex
\usepackage{fontspec}
%\setmainfont{TeX Gyre Pagella}
%\RequirePackage{unicode-math}
%\setmathfont{XITS Math}
%\setmainfont{Open Sans}
%\setsansfont{Open Sans}
%\setmathfont[range={it}]{Open Sans:style=Italic}
%\setmathfont[range={it}]{Open Sans}
\else
\usepackage[T1]{fontenc}
\usepackage[utf8]{inputenc}
% Or whatever. Note that the encoding and the font should match. If T1
% does not look nice, try deleting the line with the fontenc.
\fi
\title[Decentralised Hashtag Federation] % (optional, use only with long paper titles)
{Decentralised Hashtag Search and Subscription
for Federated Social Networks}
\author
{Trolli Schmittlauch}
% - Give the names in the same order as they appear in the paper.
% - Use the \inst{?} command only if the authors have different
% affiliation.
\institute[] % (optional, but mostly needed)
{
Department of Computer Science\\
Technical University Dresden
}
\date[APConf 2019] % (optional, should be abbreviation of conference name)
{ActivityPubConf 2019}
\datecity{Prague}
% - Either use conference name or its abbreviation.
% - Not really informative to the audience, more for people (including
% yourself) who are reading the slides online
%\subject{Privacy}
% This is only inserted into the PDF information catalog. Can be left
% out.
% If you have a file called "university-logo-filename.xxx", where xxx
% is a graphic format that can be processed by latex or pdflatex,
% resp., then you can add a logo as follows:
% \pgfdeclareimage[height=0.5cm]{university-logo}{university-logo-filename}
% \logo{\pgfuseimage{university-logo}}
% Delete this, if you do not want the table of contents to pop up at
% the beginning of each subsection:
%\AtBeginSubsection[]
%{
% \begin{frame}<beamer>{Outline}
% \tableofcontents[currentsection,currentsubsection]
% \end{frame}
%}
% If you wish to uncover everything in a step-wise fashion, uncomment
% the following command:
%\beamerdefaultoverlayspecification{<+->}
\begin{document}
\maketitle
\note{introduce myself:\\
known as schmittlauch on the Internet\\
student of Computer Science @ TU Dresden\\
interest in federated systems and unusual social networks\\
presenting my work on a study paper from this year}
\begin{frame}{Outline}
\tableofcontents
% You might wish to add the option [pausesections]
\end{frame}
% Structuring a talk is a difficult task and the following structure
% may not be suitable. Here are some rules that apply for this
% solution:
% - Exactly two or three sections (other than the summary).
% - At *most* three subsections per section.
% - Talk about 30s to 2min per frame. So there should be between about
% 15 and 30 frames, all told.
% - A conference audience is likely to know very little of what you
% are going to talk about. So *simplify*!
% - In a 20min talk, getting the main ideas across is hard
% enough. Leave out details, even if it means being less precise than
% you think necessary.
% - If you omit details that are vital to the proof/implementation,
% just say so once. Everybody will be happy with that.
\section{Motivation}
\begin{frame}{Welcome to ActivityPubConf!}{Motivation}
\only<1>{
\includegraphics[width=\textwidth]{figures/toot_nohashtags.png}
\note{Who has been posting about this Conference?}
}
\only<2>{
\includegraphics[width=\textwidth]{figures/toot_hashtags.png}
\note{And who used \#ActivityPubConf?}
}
\end{frame}
\subsection{Importance of \#Hashtags}
\begin{frame}{Importance of \#Hashtags}{}
Hashtags are used for marking posts about certain topics or events:
\note{mark topics of posts, make them discoverable by content. No full text search in fediverse}
\begin{columns}
\begin{column}{0.47\textwidth}
\begin{itemize}
\item<1-> \textbf{events}: \#ActivityPubConf, \#CCCamp19
\item<2-> \textbf{political topics}: \#SaveTheInternet
\item<3-> \textbf{general topics}: \#mastoadmin, \#Tusky
\item<4-> \textbf{ongoing demonstrations}: \#GeziPark, \#WomensMarch
\item<5-> \textbf{social movements}: \#MeToo
\end{itemize}
\end{column}
\begin{column}{0.5\textwidth}
\begin{overlayarea}{\textwidth}{0.6\paperheight}
\centering
\only<1>{
\includegraphics[width=\textwidth]{figures/APConfLogo.png}
}
\only<2>{
\includegraphics[height=0.58\paperheight]{figures/hashtag_savetheinternet.jpg}\\
\tiny{\href{https://www.flickr.com/photos/8183946@N05/14733648892}{``Obama in the Backseat: Rally to Save the Internet''} by \href{https://www.flickr.com/photos/8183946@N05}{Free Press Pics} is licensed under \href{https://creativecommons.org/licenses/by-sa/2.0/?ref=ccsearch&atype=rich}{CC BY-SA 2.0} \ccbysa}
}
\only<3>{
\includegraphics[height=0.6\paperheight]{figures/Elephant_Friend_(Greeting).png}
}
\only<4>{
\includegraphics[width=\textwidth]{figures/hashtag_gezipark.jpg}
}
\only<5>{
\includegraphics[height=0.6\paperheight]{figures/hashtag_metoo.jpg}\\
\tiny{\href{https://www.flickr.com/photos/50612692@N04/28039368079}{``IMG\_4263''} by \href{https://www.flickr.com/photos/50612692@N04}{GGAADD} is licensed under \href{https://creativecommons.org/licenses/by-sa/2.0/?ref=ccsearch&atype=rich}{CC BY-SA 2.0} \ccbysa}
}
\end{overlayarea}
\end{column}
\end{columns}
\end{frame}
\subsection{State of Hashtags in the Fediverse}
\begin{frame}{State of Hashtags on the Fediverse}{}
{\center \Large Hashtags are used in the fediverse}
\pause
\vspace{2em}
{\large But do they behave as expected?}
\end{frame}
\begin{frame}
\begin{columns}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[height=0.65\paperheight]{figures/{activitypubconf_toot.matereal.eu}.png}
\caption{\#activitypubconf on the single-user instance \textit{toot.matereal.eu}}
\end{figure}
\end{column}
\begin{column}{0.5\textwidth}
\begin{figure}
\includegraphics[height=0.65\paperheight]{figures/{activitypubconf_mastodon.social}.png}
\caption{\#activitypubconf on the large instance \textit{mastodon.social}}
\end{figure}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{State of Hashtags on the Fediverse}{Fragmentation}
\begin{itemize}
\item fragmented view on hashtag posts depending on user's instance
\item hashtag search only on locally known posts
\item Result: incentive to cluster on large nodes \(\Rightarrow\) centralisation
\end{itemize}
\end{frame}
\begin{frame}{Reason}{Push-Federation}
\begin{columns}
\begin{column}{0.55\textwidth}
\includegraphics[height=0.6\paperheight]{figures/push_federation.pdf}
\end{column}
\begin{column}{0.45\textwidth}
\begin{itemize}
\only<1>{\item subscription to \texttt{@alice@cyber.space} by contacting instance \texttt{cyber.space}}
\only<2>{\item all future posts by alice are delivered to instances of subscribers, but \textit{not} instances without any subscriber}
\only<3>{\item other ways for posts to reach an instance:\\ boosts, thread resolution}
\end{itemize}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Current Solutions}
\begin{itemize}
\item Mastodon PubRelay or Pleroma lite-pub relay:
\begin{itemize}
\item centralised actor relaying all incoming posts
\item single point of failure, which relay to choose?
\item relaying all incoming posts \(\Rightarrow\) huge load on small instances
\item only access to posts sent after initial subscription
\end{itemize}
\item Diaspora* SocialRelay
\begin{itemize}
\item similar, but allows subscribing to certain tags only
\end{itemize}
\end{itemize}
\end{frame}
\section{System Architecture}
\begin{frame}{System Architecture}{Goals}
\begin{itemize}
\item \textbf{relay \& subscribe}: instances can subscribe to all public posts of a hashtag
\item \textbf{store \& query}: instances can retrieve 1 year of history for a hashtag without subscription
\item fully decentralised, no single point of authority for all tags
\end{itemize}
\end{frame}
\begin{frame}{System Architecture}{adding a DHT backend to the fediverse}
core idea: distribute responsibility for tags among instances using a \textbf{D}istributed \textbf{H}ash \textbf{T}able, \note{distribute responsibility for posts of a hashtag = relaying \& storage}
based on Chord
\note[item]{DHT: structured P2P networks providing efficient (log N) key-value storage and lookup}
\note[item]{self-organising, no central authority}
\end{frame}
\begin{frame}{System Architecture}{adding a DHT backend to the fediverse}
\begin{columns}
\begin{column}{0.4\textwidth}
\begin{itemize}
\item calculate hash value of keys and node IDs
\item place these hashes onto the same circular name space
\item each node keeps routing table of \(\log(\mathit{\#number\_nodes})\) entries\note[item]{joining and leaving covered in paper}
\end{itemize}
\end{column}
\begin{column}{0.6\textwidth}
\includegraphics[height=0.62\paperheight]{figures/finger_table_routing_1.pdf}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{System Architecture}{adding a DHT backend to the fediverse}
\begin{columns}
\begin{column}{0.55\textwidth}
\begin{itemize}
\item next nodeID \(\geq\) \texttt{hash(hashtag)} (mod keyspace size) is responsible for handling posts containing \texttt{hashtag}
\item DHT used for iterative lookup of responsible relay/ storage node
\end{itemize}
\end{column}
\begin{column}{0.45\textwidth}
\includegraphics[height=0.62\paperheight]{figures/finger_table_routing_2.pdf}
\end{column}
\end{columns}
\end{frame}
\begin{frame}{Publishing, Relaying and Storage}{lifecycle of posts}
\begin{enumerate}
\item publishing instance looks up responsible relay instance on DHT for each included hashtag
\item publishing instance sends post to responsible relay instance
\item relay instance looks up responsible storage node on DHT
\item relay instance verifies incoming post's signature, then relays post URI (ID) to all subscribers + storage node\note[item]{only post ID relayed, but not full post content. Reasons: LDSignatures not supported everywhere, deniability \& revocation}
\item subscribing instances can now retrieve the full authenticated post from received post URI
\end{enumerate}
\note[item]{for joining and leaving the DHT see paper}
\end{frame}
\begin{frame}{Publishing, Relaying and Storage}
\begin{itemize}
\item separate DHTs for relay and storage instances
\item all actions after DHT lookup supposed to be done using ActivityPub via HTTPS
\item subscription to hashtags/ querying posts is done at the responsible instance
\end{itemize}
\end{frame}
\begin{frame}
\note{so far so easy. But load distribution issues}
\begin{itemize}
\item node ID determines set of hashtags handled by instance
\item problem: for security reasons, nodes \textbf{must not} choose their IDs freely
\item Can instances be overloaded by their assigned hashtag posts?
\end{itemize}
\end{frame}
\begin{frame}{Distribution of Posts per Tag}
\includegraphics[width=0.49\textwidth]{statistics/twitter_hashtags_total.png}
~
\includegraphics[width=0.49\textwidth]{statistics/geraspora_hashtags_total.png}
\note{analysis of a 1 month dump of Twitter, Geraspora (Diaspora) and Friendica posts\\
Twitter: 70\% of hashtags used just once\\
note the logarithmic axis!}
distribution of posts per hashtag follows a steep power law
\note{So what if a small node gets several large hashtags? \(\Rightarrow\) need for load balancing}
\end{frame}
\begin{frame}{Load Balancing}{of hashtags between nodes}
\begin{itemize}
\item \textit{k-choices} algorithm by Ledlie and Seltzer
\item each node can choose from \(\kappa\) possible IDs
\item nodes have a \textbf{capacity} and choose set of active IDs according to lowest mismatch of own and neighbour node capacity
\item querying load of potential IDs before joining, periodic re-balancing
\item independent load balancing of relay and storage nodes due to independent DHTs
\note[item]{a simple simulation on the effectiveness of the balancing algorithm can be found in the paper}
\end{itemize}
% for Kolloquium, add simulation result
\end{frame}
\begin{frame}{Redundancy}
\begin{columns}
\begin{column}{0.55\textwidth}
\begin{itemize}
\item store redundant copies of hashtag data at equal distances on Chord ring\note{resilience against node failure, allows data validation through cross-checking}
\item default redundancy: \(2^2 = 4\), scalable in powers of 2
\item \textbf{relay nodes}: hot standby nodes take over in overload situations (load spikes)
\item \textbf{storage nodes}: overloaded nodes can split stored posts by content hash and double redundancy set
\end{itemize}
\end{column}
\begin{column}{0.45\textwidth}
\includegraphics[width=\textwidth]{figures/redundancy_ring.pdf}
\end{column}
\end{columns}
\end{frame}
\section{Discussion}
\begin{frame}{Discussion}{I need YOUR feedback}
I want feedback from all of you, no matter whether it's from a \textit{\LARGE technical} or from a \textit{\LARGE social perspective}.
\end{frame}
\subsection{Social Considerations}
\begin{frame}{Social Considerations}
Do we even want global hashtags in the Fediverse?
\begin{itemize}
\item positive potential (conversation, coordination) vs. negative potential (spam, harassment)
\item visibility level: public posts only, unlisted, new level necessary?
\item relaying post URI only should provide plausible deniability
\item Does this circumvent instance-blocks and is this bad?
\end{itemize}
\end{frame}
\subsection{Technical Considerations}
\begin{frame}{Technical Considerations}{instance admins}
\begin{itemize}
\item intended as opt-in, domain-based push federation still better for user subscriptions
\item assumption: instances offer \(5.5\times\) the storage \& \(2.5\times\) the bandwidth of own posts
\item performance: Can this be implemented efficiently enough to not DDoS popular hashtag nodes?
\begin{itemize}
\item batched retrieval of posts from same source
\item exponential backoff retries
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}{Technical Considerations}{integration into the ActivityPub Fediverse}
\begin{itemize}
\item This architecture is an unimplemented concept so far!
\item integration into ActivityPub ecosystem\note[item]{disclaimer: I'm new to ActivityPub and have no implementation experience}
\begin{itemize}
\item hashtags may be represented as relay actors with own in- \& outbox, addressed in cc
\item relaying to subscribers via SharedInbox
\item idea for addressing: new URI scheme that gets transparently resolved to responsible node's domain via DHT\note[item]{application proxy for transparent URI scheme resolving?}
\item signalling of error codes and redundancy factors is needed
\end{itemize}
\item DHT routing communication does not use ActivityPub
\end{itemize}
\end{frame}
\begin{frame}{Technical Considerations}{node ID assignment}
\note[item]{Let's talk about the elephant in the room of ``federated services''}
\centering
\includegraphics<1>[height=0.59\paperheight]{figures/Elephant_Friend_(Greeting).png}
\includegraphics<2,4>[height=0.59\paperheight]{figures/hist_num_single_vs.png}
\note[item]{sorry to all instance admins, but: CloudFlare behaves like a MITM/ Sybil attacker}
\only<2,4>{1st peak: Masto.host, 2nd peak: Cloudflare}
\only<3>{{\ttfamily
\(h_n =\) hash(IPv6\_addr[0,63] ++ vserver)[0,63] \\
++ hash(domain ++ vserver)[0,127] \\
++ hash(IPv6\_addr[0,63] ++ vserver)[64,127]
}
\vspace{2em}
node ID derivation}
\end{frame}
\subsection{Security Considerations}
\begin{frame}{Security Considerations}
\begin{itemize}
\item attacker shall not be able to deliberately gain responsibility for certain hashtags
\begin{itemize}
\item node ID mainly dependent on IPv6 subnet
\end{itemize}
\item attacker shall not introduce arbitrary number of nodes
\begin{itemize}
\item valid domain required for node ID derivation, assumption: domains cost money
\end{itemize}
\end{itemize}
\end{frame}
\section{Summary}
\begin{frame}{Summary}
% Keep the summary *very short*.
\begin{itemize}
\item
decentralised architecture for handling posts of the same hashtag:
\begin{itemize}
\item \alert{subscribe to hashtag} and get posts \alert{relay}ed
\item \alert{query stored posts} of a certain hashtag without subscription
\end{itemize}
\item responsibility for hashtag divided among instances using a DHT
\item architecture \alert{balances the load} between nodes and maintains \alert{redundancy}
\item several open questions before implementation
\end{itemize}
% The following outlook is optional.
%\vskip0pt plus.5fill
\end{frame}
% All of the following is optional and typically not needed.
\appendix
\section<presentation>*{\appendixname}
\subsection<presentation>*{For Further Reading}
\begin{frame}[allowframebreaks]
\frametitle{References}
%\bibliography{literature}
\end{frame}
\begin{frame}
\centering\huge{Questions, comments, feedback?}
\includegraphics[height=0.4\paperheight]{figures/qr_paper.png}\\
\large\url{https://git.orlives.de/schmittlauch/paper_hashtag_federation/src/branch/master/paper_hashtag_federation.pdf}
\end{frame}
\begin{frame}
%\includegraphics[height=0.5\textheight]{figures/nomnompingu.png}\tiny\footnote{CC-BY-SA 3.0 by Elektroll}
\end{frame}
\end{document}