Skip to content
This repository has been archived by the owner on May 4, 2021. It is now read-only.

Ticket30406 refactor header constants #357

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
71 changes: 71 additions & 0 deletions sbws/lib/stem_bandwidth_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Stem's Bandwidth file HEADER_ATTR part as it is in stem's commit
658dd5281604eb9c63a91e529501947ecc65ef6b, which will be included in the next
Stem's release, 1.8.0, except ``_date`` because depends on other stem's module.
"""
# XXX: Remove this file when stem releases 1.8.0.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can't remove this file when stem releases 1.8.0.

We can only remove this file when everyone who uses sbws 1.2.0 or later, has access to stem 1.8.0 or later. We will need to check Debian package versions for stem and sbws.

from ..util.timestamp import isostr_to_dt_obj

# Converts header attributes to a given type. Malformed fields should be
# ignored according to the spec.

def _str(val):
return val # already a str


def _int(val):
return int(val) if (val and val.isdigit()) else None


def _date(val):
try:
return isostr_to_dt_obj(val)
except ValueError:
return None # not an iso formatted date


def _csv(val):
return map(lambda v: v.strip(), val.split(',')) if val is not None else None


# mapping of attributes => (header, type)

HEADER_ATTR = {
# version 1.1.0 introduced headers

'version': ('version', _str),

'software': ('software', _str),
'software_version': ('software_version', _str),

'earliest_bandwidth': ('earliest_bandwidth', _date),
'latest_bandwidth': ('latest_bandwidth', _date),
'created_at': ('file_created', _date),
'generated_at': ('generator_started', _date),

# version 1.2.0 additions

'consensus_size': ('number_consensus_relays', _int),
'eligible_count': ('number_eligible_relays', _int),
'eligible_percent': ('percent_eligible_relays', _int),
'min_count': ('minimum_number_eligible_relays', _int),
'min_percent': ('minimum_percent_eligible_relays', _int),

# version 1.3.0 additions

'scanner_country': ('scanner_country', _str),
'destinations_countries': ('destinations_countries', _csv),

# version 1.4.0 additions

'time_to_report_half_network': ('time_to_report_half_network', _int),

'recent_stats.consensus_count': ('recent_consensus_count', _int),
'recent_stats.prioritized_relay_lists': ('recent_priority_list_count', _int),
'recent_stats.prioritized_relays': ('recent_priority_relay_count', _int),
'recent_stats.measurement_attempts': ('recent_measurement_attempt_count', _int),
'recent_stats.measurement_failures': ('recent_measurement_failure_count', _int),
'recent_stats.relay_failures.no_measurement': ('recent_measurements_excluded_error_count', _int),
'recent_stats.relay_failures.insuffient_period': ('recent_measurements_excluded_near_count', _int),
'recent_stats.relay_failures.insufficient_measurements': ('recent_measurements_excluded_few_count', _int),
'recent_stats.relay_failures.stale': ('recent_measurements_excluded_old_count', _int),
}
119 changes: 41 additions & 78 deletions sbws/lib/v3bwfile.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
# -*- coding: utf-8 -*-
"""Classes and functions that create the bandwidth measurements document
(v3bw) used by bandwidth authorities."""
(v3bw) used by bandwidth authorities.

See https://gitweb.torproject.org/torspec.git/tree/bandwidth-file-spec.txt for
more information on the meaning of the ``KeyValues``
"""

import copy
import logging
Expand All @@ -15,6 +19,11 @@
TORFLOW_SCALING, SBWS_SCALING, TORFLOW_BW_MARGIN,
TORFLOW_OBS_LAST, TORFLOW_OBS_MEAN,
PROP276_ROUND_DIG, MIN_REPORT, MAX_BW_DIFF_PERC)

# XXX: replace this line by:
Copy link
Contributor

@teor2345 teor2345 May 27, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't write a comment, write code:

Suggested change
# XXX: replace this line by:
try:
from stem.descriptor.bandwidth_file import HEADER_ATTR, _int
except ImportError:
from .stem_bandwidth_file import HEADER_ATTR, _int
except:
raise

# from stem.descriptor.bandwidth_file import HEADER_ATTR, _int
# when stem releases 1.8.0. More information in stem_bandwidth_file.py
from .stem_bandwidth_file import HEADER_ATTR, _int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please write CI tests for sbws that use stem 1.7.0 and stem master.

from sbws.lib.resultdump import ResultSuccess, _ResultType
from sbws.util.filelock import DirectoryLock
from sbws.util.timestamp import (now_isodt_str, unixts_to_isodt_str,
Expand All @@ -27,91 +36,34 @@
KEYVALUE_SEP_V1 = '='
KEYVALUE_SEP_V2 = ' '

# NOTE: in a future refactor make make all the KeyValues be a dictionary
# with their type, so that it's more similar to stem parser.

# Header KeyValues
# =================
# List of the extra KeyValues accepted by the class
EXTRA_ARG_KEYVALUES = ['software', 'software_version', 'file_created',
'earliest_bandwidth', 'generator_started',
'scanner_country', 'destinations_countries']
# number_eligible_relays is the number that ends in the bandwidth file
# ie, have not been excluded by one of the filters in 4. below
# They should be call recent_measurement_included_count to be congruent
# with the other KeyValues.

# XXX: These constants need to be kept even if using Stem's header one.
# Unless Stem's will add a property type to the attributes.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this comment.
Please change the comment so it is clearer.

What does stem need to do?
Is there a stem ticket for this change?

STATS_KEYVALUES = ['number_eligible_relays', 'minimum_number_eligible_relays',
'number_consensus_relays', 'percent_eligible_relays',
'minimum_percent_eligible_relays']

# KeyValues that count the number of relays that are in the bandwidth file,
# but ignored by Tor when voting, because they do not have a
# measured bandwidth.
BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED = [
# Number of relays that were measured but all the measurements failed
# because of network failures or it was
# not found a suitable helper relay
'recent_measurements_excluded_error_count',
# Number of relays that have successful measurements but the measurements
# were not away from each other in X time (by default 1 day).
'recent_measurements_excluded_near_count',
# Number of relays that have successful measurements and they are away from
# each other but they are not X time recent.
# By default this is 5 days, which is the same time the older
# the measurements can be by default.
'recent_measurements_excluded_old_count',
# Number of relays that have successful measurements and they are away from
# each other and recent
# but the number of measurements are less than X (by default 2).
'recent_measurements_excluded_few_count',
]
# Added in #29591
# NOTE: recent_consensus_count, recent_priority_list_count,
# recent_measurement_attempt_count and recent_priority_relay_count
# are not reset when the scanner is stop.
# They will accumulate the values since the scanner was ever started.
BW_HEADER_KEYVALUES_MONITOR = [
# 1.1 header: the number of different consensuses, that sbws has seen,
# since the last 5 days
'recent_consensus_count',
# 2.4 Number of times a priority list has been created
'recent_priority_list_count',
# 2.5 Number of relays that there were in a priority list
# [50, number of relays in the network * 0.05]
'recent_priority_relay_count',
# 3.6 header: the number of times that sbws has tried to measure any relay,
# since the last 5 days
# This would be the number of times a relays were in a priority list
'recent_measurement_attempt_count',
# 3.7 header: the number of times that sbws has tried to measure any relay,
# since the last 5 days, but it didn't work
# This should be the number of attempts - number of ResultSuccess -
# something else we don't know yet
# So far is the number of ResultError
'recent_measurement_failure_count',
# The time it took to report about half of the network.
'time_to_report_half_network',
] + BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED
BANDWIDTH_HEADER_KEY_VALUES_INIT = \
['earliest_bandwidth', 'generator_started',
'scanner_country', 'destinations_countries']\
+ STATS_KEYVALUES \
+ BW_HEADER_KEYVALUES_MONITOR

KEYVALUES_INT = STATS_KEYVALUES + BW_HEADER_KEYVALUES_MONITOR
# List of all unordered KeyValues currently being used to generate the file
UNORDERED_KEYVALUES = EXTRA_ARG_KEYVALUES + STATS_KEYVALUES + \
['latest_bandwidth'] + \
BW_HEADER_KEYVALUES_MONITOR
# List of all the KeyValues currently being used to generate the file
ALL_KEYVALUES = ['version'] + UNORDERED_KEYVALUES
# The stem's attribute name for these Keys starts with this string.
BW_HEADER_KEYVALUES_RECENT_MEASUREMENTS_EXCLUDED = \
[k for n, (k, _) in HEADER_ATTR.items()
if n.startswith("recent_stats.relay_failures.")]

# XXX: The following constants are kept for compatibility with the generator/
# parser here. Use stem's parser/generator when possible and refactor this
# code when stem releases 1.8.0.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't write comments, write code.

Refactor the code now. Use the old code with old stem, and the new code with new stem.

Then you can write CI tests for sbws that use stem 1.7.0 and stem master.

KEYVALUES_INT = [k for k, t in HEADER_ATTR.values() if t is _int]
# # List of all unordered KeyValues currently being used to generate the file
UNORDERED_KEYVALUES = [k for k, _ in HEADER_ATTR.values()]
UNORDERED_KEYVALUES.remove('version')

TERMINATOR = '====='

# Bandwidth Lines KeyValues
# =========================
# Num header lines in v1.X.X using all the KeyValues
NUM_LINES_HEADER_V1 = len(ALL_KEYVALUES) + 2
NUM_LINES_HEADER_V1 = len(HEADER_ATTR) + 2
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why 2?
Please make 2 into a named constant.

LINE_TERMINATOR = TERMINATOR + LINE_SEP

# KeyValue separator in Bandwidth Lines
Expand Down Expand Up @@ -250,20 +202,28 @@ class V3BWHeader(object):
when the generator started
"""
def __init__(self, timestamp, **kwargs):
# XXX: do not convert all the arguments to string here, convert them
# only in __str__ and check their actual types.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This comment explains what the code is doing.
But why are the types important?

assert isinstance(timestamp, str)
for v in kwargs.values():
assert isinstance(v, str)
self.timestamp = timestamp
# KeyValues with default value when not given by kwargs

# Initialize all defined attrs,
# XXX: Since using the stem's name for the attributes will require
# other refactoring here, use the name in the actual file.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is "the name in the actual file"?
Do you mean "the key names from the bandwidth file spec"?

header_keys = [k for k, _ in HEADER_ATTR.values()]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not easy to understand if this code is refactoring or changes the behavior of sbws (adds/removes headers).

Could we have a unittest which makes sure that the old code and the new code will output the exact same thing? Perhaps instead of commenting out the code above, you can move it to the tests directory and make a unittest that checks the output of the old with the new code. Would this make sense?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not easy to understand if this code is refactoring or changes the behavior of sbws (adds/removes headers).

Could we have a unittest which makes sure that the old code and the new code will output the exact same thing? Perhaps instead of commenting out the code above, you can move it to the tests directory and make a unittest that checks the output of the old with the new code. Would this make sense?

Makes sense but then need to keep this old code in the test.
Stem has already tests for all the bandwidth files versions using data generated from sbws.
I checked, and including the stem's tests would require adding many more files.

So i came out with other solution that it's something i wanted to do time ago: include real measurements and bandwidth files generated with the test network for the tests.

So i run the integration tests to generate those files with master branch, then i created a test that generates the bandwidth file from the previous results and compare that the bandwidth file is the same.

I'm not pushing these changes yet until knowing your opinion. If you don't like that solution will open other ticket to include them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Other way of ensuring that i didn't accidentally change functionality is the fact that i didn't have to change any test for this PR. It's true though that i don't think there're unit tests checking all the headers, but the previous idea would solve that.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pushed the new test

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like this new test very much. I think it's a good step.
I added a few comments while I was trying to understand what the test does.

[setattr(self, k, v) for k, v in kwargs.items() if k in header_keys]

# Then initialize the ones with defaults, if they weren't already.
# Ideally these defaults could be defined in the HEADER_ATTR constant.
self.version = kwargs.get('version', SPEC_VERSION)
self.software = kwargs.get('software', 'sbws')
self.software_version = kwargs.get('software_version', __version__)
self.file_created = kwargs.get('file_created', now_isodt_str())
# latest_bandwidth should not be in kwargs, since it MUST be the
# same as timestamp
self.latest_bandwidth = unixts_to_isodt_str(timestamp)
[setattr(self, k, v) for k, v in kwargs.items()
if k in BANDWIDTH_HEADER_KEY_VALUES_INIT]

def __str__(self):
if self.version.startswith('1.'):
Expand Down Expand Up @@ -337,9 +297,10 @@ def from_lines_v1(cls, lines):
log.warn('Terminator is not in lines')
return None
ts = lines[0]
header_keys = [k for k, _ in HEADER_ATTR.values()]
kwargs = dict([l.split(KEYVALUE_SEP_V1)
for l in lines[:index_terminator]
if l.split(KEYVALUE_SEP_V1)[0] in ALL_KEYVALUES])
if l.split(KEYVALUE_SEP_V1)[0] in header_keys])
h = cls(ts, **kwargs)
# last line is new line
return h, lines[index_terminator + 1:-1]
Expand Down Expand Up @@ -1056,6 +1017,8 @@ def is_max_bw_diff_perc_reached(bw_lines,
sum_bw = sum([l.bw for l in bw_lines
if getattr(l, 'consensus_bandwidth', None)
and getattr(l, 'unmeasured', 0) == 0]) * 1000
# For tests without consensus bandwidth, avoid division by 0
sum_bw = sum_bw or 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is weird to see a test workaround in a code file.

Please separate the tests and the code.

# Percentage difference
diff_perc = (
abs(sum_consensus_bw - sum_bw)
Expand Down
64 changes: 64 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,17 @@
"""Common pytest configuration for unit and integration tests."""
import argparse
import os.path
import pytest
from unittest import mock

from sbws.lib import resultdump
from sbws.util import config
from sbws.util.parser import create_parser


FAKE_TIME = 1557556118


@pytest.fixture(scope='session')
def parser():
return create_parser()
Expand All @@ -29,3 +38,58 @@ def readlines(self, name):
with self.open(name, "r") as f:
return f.readlines()
return D(request.fspath.dirpath("data"))


@pytest.fixture()
def tests_data_dir(scope="global"):
"""Root data directory, common to all the tests."""
here = os.path.abspath(os.path.dirname(__file__))
return os.path.join(here, "data")


@pytest.fixture()
def sbws_dir(tests_data_dir):
return os.path.join(tests_data_dir, ".sbws")


# The following fixtures are similar to the ones defined in unit and
# integration sub-directories, but they are for the tests data to be read,
# not to write.
@pytest.fixture()
def sbws_args(tests_data_dir):
config_fpath = os.path.join(tests_data_dir, ".sbws.ini")
sbws_args = argparse.Namespace(**{'config': config_fpath})
return sbws_args


@pytest.fixture()
def sbws_conf(sbws_args, sbws_dir):
"""Default configuration with sbws home in the tmp test dir."""
conf = config.get_config(sbws_args)
conf['paths']['sbws_home'] = sbws_dir
conf['paths']['v3bw_fname'] = "${v3bw_dname}/latest.v3bw"
return conf


@pytest.fixture()
@mock.patch('time.time')
def measurements(mock_time, sbws_conf):
# Because load_recent_results_in_datadir will use time.time()
# to decide which results are recent.
mock_time.return_value = FAKE_TIME
measurements = resultdump \
.load_recent_results_in_datadir(
sbws_conf.getint('general', 'data_period'),
sbws_conf.getpath('paths', 'datadir'))
return measurements


@pytest.fixture
def bandwidth_file_headers():
d = {
'earliest_bandwidth': "2019-05-11T06:26:41",
'file_created': "2019-05-11T06:32:32",
'generator_started': "2019-05-11T06:26:28",
'latest_bandwidth': "2019-05-11T06:28:38",
}
return d
30 changes: 30 additions & 0 deletions tests/data/.sbws.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
[general]
# Days into the past that measurements are considered valid
data_period = 1

[paths]
sbws_home = .sbws
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps we don't need to hide this data in .sbws? I'm not used to files in source code being hidden. Why are we doing it here?


[scanner]
country = ZZ

[destinations]
local = on

[destinations.local]
; url = https://localhost:28888/sbws.bin
url = http://127.0.0.1:28888/sbws.bin
verify = False
country = ZZ

[tor]
extra_lines =
DirAuthority auth1 orport=2002 no-v2 v3ident=D7DBC517EFD2BA1A5012CF1BD0BB38F17C8160BD 127.10.0.1:2003 AA45C13025C037F056E734169891878ED0880231
DirAuthority auth2 orport=2002 no-v2 v3ident=4EE103A081F400E6622F5461D51782B876BB5C24 127.10.0.2:2003 E7B3C9A0040D628DAC88B0251AE6334D28E8F531
DirAuthority auth3 orport=2002 no-v2 v3ident=8B85069C7FC0593801E6491A34100264FCE28980 127.10.0.3:2003 35E3B8BB71C81355649AEC5862ECB7ED7EFDBC5C
TestingTorNetwork 1
NumCPUs 1

[logging]
level = debug
to_stdout_level = ${level}
Loading