Merge pull request #169 from marblestation/reindex_checksize
Make index rebuild check env var for min committed docs
marblestation authored Jun 20, 2023
2 parents 9db8613 + c7a0d64 commit 9e7dbbb
Showing 3 changed files with 39 additions and 16 deletions.
10 changes: 5 additions & 5 deletions .github/workflows/python_actions.yml
@@ -32,10 +32,10 @@ jobs:
           --health-timeout 5s
           --health-retries 5
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
         with:
-          python-version: 2.7
+          python-version: 3.8
 
       - name: Install dependencies
         run: |
@@ -57,8 +57,8 @@ jobs:
     runs-on: ubuntu-latest
 
     steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
         with:
           python-version: 3.8

2 changes: 1 addition & 1 deletion adsmp/tests/test_reindex.py

Large diffs are not rendered by default.

43 changes: 33 additions & 10 deletions scripts/reindex.py
@@ -10,6 +10,7 @@
 import pickle
 import requests
 import time
+import json
 
 proj_home = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
 if proj_home not in sys.path:
@@ -126,12 +127,20 @@ def run():
         time.sleep(30)
     logger.info('Solr has registered a new searcher')
 
+    logger.info('Waiting for the new collection to have a minimum number of committed documents')
     # all went well, verify the numDocs is similar to the previous collection
-    time.sleep(30)
-    cores = requests.get(cores_url + '?wt=json').json()
-    logger.info('core info is: {}'.format(cores))
-    verify_collection2_size(cores['status']['collection2'])
-    logger.info('Successfully verified the collection')
+    min_committed_docs = int(os.environ.get('MIN_COMMITTED_DOCS', 17500000))
+    min_index_size = int(os.environ.get('MIN_INDEX_SIZE', 200))  # GB
+    for _ in range(24):  # Check every 5 minutes for 2 hours max
+        time.sleep(300)
+        verified, verified_msg = verify_collection2_size(cores_url, min_committed_docs, min_index_size)
+        if verified:
+            break
+    if verified:
+        logger.info(verified_msg)
+    else:
+        raise Exception(verified_msg)
 
 
     # all is well; swap the cores!
     r = requests.get(cores_url + '?action=SWAP&core=collection2&other=collection1&wt=json')
@@ -171,11 +180,25 @@ def write_lockfile(lockfile, data):
         pickle.dump(data, f)


-def verify_collection2_size(data):
-    if data['index'].get('numDocs', 0) <= 15117785:
-        raise Exception('Too few documents in the new index: %s' % data['index'].get('numDocs', 0))
-    if data['index'].get('sizeInBytes', 0) / (1024*1024*1024.0) <= 146.0: # index size at least 146GB
-        raise Exception('The index is suspiciously small: %s' % (data['index'].get('sizeInBytes', 0) / (1024*1024*1024.0),))
+def verify_collection2_size(cores_url, min_committed_docs, min_index_size):
+    # Try to get info from solr
+    try:
+        response = requests.get(cores_url + '?wt=json')
+        #response.raise_for_status() # Raise an exception for non-2xx status codes
+        cores = response.json()
+    except (requests.exceptions.RequestException, json.decoder.JSONDecodeError, ValueError, TypeError) as e:
+        return (False, str(e))
+    # Extract key values
+    data = cores.get('status', {}).get('collection2', {})
+    num_docs = data.get('index', {}).get('numDocs', 0)
+    index_size = data.get('index', {}).get('sizeInBytes', 0) / (1024*1024*1024.0)  # GB
+    logger.info('New collection has {} committed entries and the index size is {:.2f} GB'.format(num_docs, index_size))
+    if num_docs <= min_committed_docs:
+        return (False, 'Too few committed documents in the new index: {}'.format(num_docs))
+    if index_size <= min_index_size:
+        return (False, 'The new index is suspiciously small: {:.2f} GB'.format(index_size))
+    return (True, 'Successfully verified the new collection')
 
 
 def str_to_datetime(s):
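As an aside, here is a minimal sketch of how the new tuple-returning contract of verify_collection2_size could be exercised against a stubbed Solr cores response. The import path, threshold values, cores URL, and use of unittest.mock below are illustrative assumptions, not code taken from this repository:

# Sketch only: stub requests.get as seen by scripts/reindex and check the
# (verified, message) contract. Import path and thresholds are assumptions.
from unittest import mock

from scripts.reindex import verify_collection2_size  # hypothetical import path

fake_cores = {
    'status': {
        'collection2': {
            'index': {
                'numDocs': 18000000,             # above the document threshold
                'sizeInBytes': 220 * 1024 ** 3,  # ~220 GB, above the size threshold
            }
        }
    }
}

with mock.patch('scripts.reindex.requests.get') as mock_get:
    mock_get.return_value.json.return_value = fake_cores
    verified, msg = verify_collection2_size(
        'http://localhost:8983/solr/admin/cores',  # hypothetical cores URL
        min_committed_docs=17500000,
        min_index_size=200,  # GB
    )

assert verified, msg  # msg == 'Successfully verified the new collection'

Returning a (verified, message) tuple instead of raising lets run() keep polling inside its retry loop and defer the failure decision until the two-hour window is exhausted.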
