Skip to content

Commit

Permalink
Use the datasets command line because the API will be deprecated
Browse files: browse the repository at this point in the history
  • Loading branch information
tongzhouxu committed Apr 24, 2024
1 parent 9840dfb commit b534d79
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 67 deletions.
84 changes: 19 additions & 65 deletions mashpit/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,71 +219,25 @@ def calculate_centroid(df_metadata,pdg_acc,tmp_folder):
def download_and_sketch_assembly(gca_acc_list,hash_number,kmer_size,tmp_folder):
logging.info('Downloading and sketching assemblies...')
time_start_download_and_sketch_assembly = time.time()
with DatasetsApiClient() as api_client:
api_instance = DatasetsGenomeApi(api_client)
if len(gca_acc_list) <= 400:
api_response = api_instance.download_assembly_package(
gca_acc_list,
_preload_content=False,
hydrated='DATA_REPORT_ONLY',
_request_timeout=600,
_return_http_data_only=True
)
zip_file = os.path.join(tmp_folder,'assembly.zip')
with open(zip_file, 'wb') as f:
f.write(api_response.data)
api_response.close()
api_client.close()
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(os.path.join(tmp_folder,'assembly'))
# try to rehydrate the dataset, if failed, try again after 5 seconds
while True:
try:
subprocess.run(['datasets', 'rehydrate', '--directory', os.path.join(tmp_folder,'assembly')],
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
break
except subprocess.CalledProcessError:
time.sleep(5)
logging.error(f'Failed to rehydrate {tmp_folder}/assembly. Trying again...')
else:
# divide the list into chunks if the list is too long for HTTP request
chunk_list = [gca_acc_list[i:i+400] for i in range(0, len(gca_acc_list), 400)]
j = 0
for sublist in chunk_list:
api_response = api_instance.download_assembly_package(
sublist,
_preload_content=False,
hydrated='DATA_REPORT_ONLY',
_request_timeout=600,
_return_http_data_only=True
)
zip_file = os.path.join(tmp_folder,'assembly_'+str(j)+'.zip')
with open(zip_file, 'wb') as f:
f.write(api_response.data)
api_response.close()
j = j+1
api_client.close()
# get a list of all zip files in tmp folder
file_list = glob.glob(os.path.join(tmp_folder,'assembly_' + '*'))
os.makedirs(os.path.join(tmp_folder,'assembly','ncbi_dataset','data'))
for file in file_list:
with zipfile.ZipFile(file, 'r') as zip_ref:
folder_name = file.replace('.zip','')
zip_ref.extractall(os.path.join(tmp_folder,folder_name))
# try to rehydrate the dataset, if failed, try again after 5 seconds
while True:
try:
subprocess.run(['datasets', 'rehydrate', '--directory', os.path.join(tmp_folder,folder_name)],
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
break
except subprocess.CalledProcessError:
time.sleep(5)
logging.error(f'Failed to rehydrate {folder_name}. Trying again...')
source_folder = os.path.join(tmp_folder,folder_name,'ncbi_dataset','data','GCA_*')
destination_folder = os.path.join(tmp_folder,'assembly','ncbi_dataset','data')
files_to_move = glob.glob(source_folder)
for f in files_to_move:
shutil.move(f, destination_folder)
gca_acc_file = os.path.join(tmp_folder,'gca_list.txt')
with open(gca_acc_file, 'w') as f:
for item in gca_acc_list:
f.write("%s\n" % item)
subprocess.run(['datasets', 'download', 'genome','accession', '--inputfile', gca_acc_file, '--filename', 'assembly.zip','--dehydrated'])
# move the zip file to the tmp folder
zip_file = os.path.join(tmp_folder,'assembly.zip')
shutil.move('assembly.zip',zip_file)
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
zip_ref.extractall(os.path.join(tmp_folder,'assembly'))
# try to rehydrate the dataset, if failed, try again after 5 seconds
while True:
try:
subprocess.run(['datasets', 'rehydrate', '--directory', os.path.join(tmp_folder,'assembly')],
check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
break
except subprocess.CalledProcessError:
time.sleep(5)
logging.error(f'Failed to rehydrate {tmp_folder}/assembly. Trying again...')
time_end_download = time.time()
logging.info(f'Downloaded {len(gca_acc_list)} assemblies in {round(time_end_download-time_start_download_and_sketch_assembly,2)} seconds.')
os.mkdir(os.path.join(tmp_folder,'signature'))
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,6 @@ mashpit @ file:///Users/tongzhou/mashpit
matplotlib==3.8.3
matplotlib-inline==0.1.6
natsort==8.4.0
ncbi-datasets-pylib==14.6.5
numpy==1.26.4
packaging==24.0
pandas==2.2.1
Expand Down
1 change: 0 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
entry_points={'console_scripts':['mashpit=mashpit.mashpit:main']},
install_requires=[
'sourmash~=4.6.1',
'ncbi-datasets-pylib~=14.6.2',
'pandas',
'biopython',
'scikit-bio',
Expand Down

0 comments on commit b534d79

Please sign in to comment.