paper_hashtag_federation/ID_debug.py
Trolli Schmittlauch 2c22aaede5 import finished university paper & pseudonymise
- dropping all history from the university paper in case it contains
sensitive data
2019-08-05 23:59:43 +02:00

110 lines
3.5 KiB
Python
Executable file

#!/usr/bin/env python3
import ast
import csv
import hashlib
from collections import deque
from ipaddress import IPv6Address, AddressValueError
from itertools import chain
from operator import itemgetter

import numpy as np
from gmpy2 import mpfr, get_context, cos as gcos, sin as gsin, ceil as gceil
from matplotlib import pyplot as plt, rcParams
ID_BITS = 256
# set up precision to 256 significant bits
# reason: 256-bit IDs are converted to a (mpfr) float
# before division
get_context().precision = ID_BITS
# radius of circle representing namespace
RAD = 50
C = np.array((RAD+1, RAD+1), dtype=object) # center of circle
MAX_ID = mpfr(2**ID_BITS-1)
def calc_nodeID(hostname: str, ipaddr: str, vserver: int = 0) -> "tuple[int, str, str]":
    """Derive a 256-bit DHT node ID from hostname, IPv6 address and vserver number.

    Returns a tuple ``(node_id, ipaddr, hostname)`` so callers can sort by ID
    while keeping the originating instance data attached.
    (Fix: the annotation previously claimed ``-> bytes``.)

    Raises:
        ValueError: if ``ipaddr`` is not a valid IPv6 address.
    """
    try:
        ip = IPv6Address(ipaddr)
    except AddressValueError:
        print("invalid IP addr", ipaddr)
        raise ValueError(f"invalid IPv6 address: {ipaddr}") from None
    # represent vserver number as bytes, in as many bytes as needed
    # for representing that number (vserver 0 encodes to b"")
    vserver_bytes = vserver.to_bytes((vserver.bit_length() + 7) // 8, 'big')
    # extract the bytes denoting the max. /64 bit network part
    # (fix: [0:7] only took 56 bits, one byte short of the stated /64)
    net_bytes = int(ip).to_bytes(16, 'big')[0:8]
    # domain name is converted to canonical punycode representation
    servername_bytes = (hostname + str(vserver)).encode("idna")
    net_hash = hashlib.shake_128(net_bytes + vserver_bytes).digest(16)
    vservername_hash = hashlib.shake_128(servername_bytes + vserver_bytes).digest(16)
    # node IDs format:
    # hash(net + vserver_num_in_bytes)[0,63] +
    # hash(servername + vserver_num_in_bytes)[0,127] +
    # hash(net + vserver_num_in_bytes)[64,127]
    nID = net_hash[:8] + vservername_hash + net_hash[8:]
    return (int.from_bytes(nID, 'big'), ipaddr, hostname)
def calc_histogram(data, bins, ofile, end=None, start=0):
    """Bin ``(value, ip, hostname)`` tuples into ``bins`` equal-width bins over [start, end].

    Writes a human-readable per-bin listing to ``ofile`` and returns
    ``(bin_centers, bin_counts)`` for plotting. If ``end`` is not given,
    the largest value in ``data`` is used.
    """
    sdata = deque(sorted(data, key=itemgetter(0)))
    if not end:
        end = sdata[-1][0]
    # mpfr arithmetic: values may be 256-bit IDs that overflow double precision
    binsize = gceil((mpfr(end) - mpfr(start)) / bins)
    histx = []
    histy = []
    # fix: file was previously opened without ever being closed
    with open(ofile, "wt") as histfile:
        while start < end:
            binlimit = start + binsize
            count = 0
            thisbin = []
            try:
                # consume all items falling into [start, binlimit)
                while start <= sdata[0][0] < binlimit:
                    count += 1
                    thisbin.append(sdata.popleft())
            except IndexError:
                # all data consumed; keep emitting empty bins up to `end`
                pass
            # plot in the center of the bin
            histx.append(int(start + 0.5 * binsize))
            histy.append(count)
            print("================", start, count, "================", sep="\n", file=histfile)
            for inst in thisbin:
                print(inst[1], inst[2], inst[0], sep=", ", file=histfile)
            print("\n", file=histfile)
            start = binlimit
    return (histx, histy)
def write_gnuplot_histdata(data, filename):
    """Write histogram data ``(xs, ys)`` as tab-separated lines for gnuplot."""
    xs, ys = data
    with open(filename, "tw") as out:
        out.writelines(f"{x}\t{y}\n" for x, y in zip(xs, ys))
def histogram_plots():
    """Read scanned instance IPs from CSV and dump a vserver-0 node-ID histogram."""
    with open("instance_scanner/instance_ips.csv", newline='') as csvfile:
        instancereader = csv.reader(csvfile)
        # histogram plot of distribution of vserver0 IDs
        # using a set comprehension because instance data contains some
        # instances twice, once with canonicalized domain name and once
        # with unicode domain name -> remove duplicate IDs
        ids_single_vserver = sorted({
            # fix: the IP-list column is parsed with ast.literal_eval instead
            # of eval() — the CSV content is externally scanned, untrusted data
            calc_nodeID(hn, ast.literal_eval(ips)[0])
            for hn, ips in instancereader}, key=itemgetter(0))
        calc_histogram(ids_single_vserver, 256, end=MAX_ID, ofile="figures/debug_num_single_vs.dat")
def main():
    """Script entry point: render the node-ID distribution histogram."""
    return histogram_plots()


if __name__ == "__main__":
    main()