-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdiffTables.py
197 lines (169 loc) · 9.82 KB
/
diffTables.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
'''
Created on Jul 30, 2013
@author: julian Garrido
'''
#from time import localtime
import MySQLdb
import numpy
from astropy.io.votable import parse_single_table
import numpy.ma as ma
import ConfigParser
import os
from numpy import array
import numpy as np
class diff_DB_CDS(object):
'''
This class is able to download a votable from CDS and compare it with a table in a database
'''
def __init__(self, server, db, user, password):
'''
Constructor
'''
self.password = password
self.server = server
self.user = user
self.db = db
def getTableFromCDS(self, url):
'''
It retrieves a votable from CDS.
example: "url=http://vizier.u-strasbg.fr/viz-bin/votable/-A?-source=J/A+A/534/A102"
'''
self.url_cds = url
self.cdstable = parse_single_table(url)
def getTableFromDB(self, query, typeColumnsList):
connection = MySQLdb.connect(host=self.server, user=self.user, passwd=self.password, db=self.db ) #,use_unicode=True, charset = ...
cursor = connection.cursor()
cursor.execute(query)
#This method doesn't work if there are Nulls in the DB. --> None in the python array
#self.dbtable = numpy.fromiter(cursor, typeColumnsList)
#Build the numpy array
#results = cursor.fetchall()
#results = list(results)
#for index_r, row in enumerate(results):
# newrow = list(row)
# for index_c, col in enumerate(newrow) :
# if col is None :
# newrow[index_c]=numpy.nan
# results[index_r] = tuple(newrow)
# #if changes :
# # results results[index_r]
values = cursor.fetchall()
ncols = len(values[0])
nrows = len(values)
arr = ma.zeros(nrows, dtype=typeColumnsList)
for i, row in enumerate(values):
for j, cell in enumerate(values[i]):
if values[i][j] is None:
arr.mask[i][j] = True
else:
arr.data[i][j] = cell
self.dbtable = arr
connection.close()
def compareTables(self, columnNamesListDB, ColumnNamesListCDS, tol):
'''
Call getTaleFromDB and getTAbleFromCds before calling this method.
This method receive a list of names for the dbtable and a list of names for the cdstable.
also an array of tolerance with an element for each column. If the column is not numeric,
the value would be ignored.
If two columns contain numbers it checks if the difference is 0. with the absolute tolerance
that is provided as input
If two columns contain strings it checks if they are equal.
If two columns have different types, it transform them two strings and checks if they are equal.
Remark: string types must be provided as static strings e.g. S3 is valid but not str
Return True if all the columns are equal.
Restriction: Columns to be compared must be in the same position in both lists.
'''
if len(columnNamesListDB) != len(ColumnNamesListCDS):
raise Exception, 'Different number of columns'
message = ""
for i in range(0, len(columnNamesListDB)):
data1 = self.cdstable.array[ColumnNamesListCDS[i]]
data2 = self.dbtable[columnNamesListDB[i]]
if len(data1) != len(data2) :
message = "There are " + str(len(data1)) + " rows in CDS and " + str(len(data2)) + " rows in the DB."
raise Exception, message
#print "column: " + str(i)
if not numpy.issubdtype(data1.dtype, str) and not numpy.issubdtype(data2.dtype, str):
#data2_ma = ma.masked_array(data2, mask=numpy.isnan(data2).tolist())
if not ma.allclose(data1, data2, atol=tol[i]) :
same = ma.allclose(data1, data2, atol=tol[i])
#msn = "Position: " + str(i) + ", column cds: " + str(ColumnNamesListCDS[i]) + ", column db: " + str(columnNamesListDB[i]) + "\n"
#print msn
#print "len data1: " + str(len(data1)) + ", len data2: " + str(len(data2)) + " \n"
#(abs(data1-data2) + tol[i]) <= 0 c = (abs(data1-data2) + tol[i]) c = ma.masked_where(c <= 0, c)
condition = (abs(data1-data2) + tol[i]) > 0
#condition = condition.data
ix = np.where(condition)
ix = ix[0]
#r = array(range(len(condition)))
#r(condition)
#difference = [(x,y) for x,y,z in zip(data1, data2, same) if z == False]
difference = [(index, x,y) for index,x,y in zip(range(len(data1)), data1, data2) ]
final_difference = [x for j_index, x in enumerate(difference) if j_index in ix]
#print str(len(final_difference))
message += ColumnNamesListCDS[i] + ' in CDStable and ' + columnNamesListDB[i] + ' in db are not equal for some rows: ' + str(final_difference) + '\n'
elif numpy.issubdtype(data1.dtype, str) and numpy.issubdtype(data2.dtype, str):
if ((sum(data1 == data2) ) != len(data1)):
same = data1==data2
difference = [(x,y) for x,y,z in zip(data1, data2, same) if z == False]
message += ColumnNamesListCDS[i] + ' in CDStable and ' + columnNamesListDB[i] + ' in db are not equal ('+ str(sum(data1 != data2)) + ' items): ' + str(difference) + '\n'
#print data1[0], data2[0], data1[0]==data2[0]
elif not numpy.issubdtype(data1.dtype, str) :
newcolumn = numpy.array(map(str, data1))
if (sum(newcolumn == data2) != len(data1)):
message += ColumnNamesListCDS[i] + ' in cdstable and ' + columnNamesListDB[i] + ' in db are not equal and have different types. ('+ str(sum(data1 != data2)) + ' items).'
elif not numpy.issubdtype(data2.dtype, str) :
newcolumn = numpy.array(map(str, data2))
if (sum(data1 == newcolumn) != len(data1)):
message += ColumnNamesListCDS[i] + ' in cdstable and ' + columnNamesListDB[i] + ' in db are not equal and have different types. ('+ str(sum(data1 != data2)) + ' items).'
if(len(message) > 0) :
#print "Errors: " + message
raise Exception, message
else:
return True
if __name__ == '__main__':
#
# diff = diff_DB_CDS("amiga.iaa.es", "CIG_CO_LISENFELD11", "amiga", "")
#
# cdsnames = ['CIG', 'Dist', 'Vel','D25','i','TT','Mi','log(LB)','l_log(LFIR)','log(LFIR)','log(LK)','Det','MH2c','MH2e']
# url="http://vizier.u-strasbg.fr/viz-bin/votable/-A?-source=J/A+A/534/A102"
# diff.getTableFromCDS(url)
#
#
# #dtypes=[('cig',int), ('DIST', int), ('VEL', int), ('D25', float), ('POS_INCL_LOS', float), ('MType', int), ('IA', int), ('log(LB)', float), ('l_log(LFIR)', str), ('log(LFIR)', float), ('log(LK)', float), ('Det', int), ('log(MH2c)', float), ('log(MH2m)', float), ('log(MH2e)', float), ('Tel', int), ('BibCode', int)]
# dtypes=[('cig',int), ('DIST', int), ('VEL', int), ('D25', float), ('POS_INCL_LOS', float), ('MType', int), ('IA', int), ('log(LB)', float), ('l_log(LFIR)', 'S2'), ('log(LFIR)', float), ('log(LK)', float), ('Det', int), ('log(MH2c)', float), ('log(MH2e)', float)]
# #I have to remove ('log(MH2m)', float), tel y bibcode, from the query because the cds doesn't return these columns: , t5.`log(MH2m)`
# dbnames =[pair[0] for pair in dtypes]
# #query = "select t1.cig, t1.dist, t1.vel, t1.d25, t1.pos_incl_los, t1.mtype, t1.ia, t1.`log(LB)`, t1.`l_log(lfir)`, t1.`log(lfir)`, t1.`log(LK)`, t5.Det, t5.`log(MH2c)`, t5.`log(MH2m)`, t5.`log(MH2e)`, t5.Tel, t5.BibCode from TABLE1 as t1, TABLE5 as t5 WHERE t1.cig = t5.cig"
# query = "select t1.cig, t1.dist, t1.vel, t1.d25, t1.pos_incl_los, t1.mtype, t1.ia, t1.`log(LB)`, t1.`l_log(lfir)`, t1.`log(lfir)`, t1.`log(LK)`, t5.Det, t5.`log(MH2c)`, t5.`log(MH2e)` from TABLE1 as t1, TABLE5 as t5 WHERE t1.cig = t5.cig"
# diff.getTableFromDB(query, dtypes)
#
#
# tolerance = numpy.array([0.0001, 0.0001, 0.0001, 0.001, 0.0001, 0.0001, 0.0001, 0.001, 0, 0.001, 0.001, 0.0001, 0.001, 0.001 ])
#
#
# #Check why l_log(LFIR) is not equal in both tables: the answer is that one contains spaces and the other nothing.
# #print (diff.cdstable.array['l_log(LFIR)'])
# #print (diff.dbtable['l_log(LFIR)'])
# #same = diff.cdstable.array['l_log(LFIR)']==diff.dbtable['l_log(LFIR)']
# #cig_diff = diff.cdstable.array['CIG']-diff.dbtable['cig']
# #[(z,x,y) for x,y,z,k in zip(list(diff.cdstable.array['l_log(LFIR)']), diff.dbtable['l_log(LFIR)'], diff.dbtable['cig'], same) if k == False]
#
# #print dbnames
# diff.compareTables(dbnames, cdsnames, tolerance)
#new case
config = ConfigParser.RawConfigParser(allow_no_value=True)
config.read(['amigaDB.cfg', os.path.expanduser('~/.amigaDB.cfg')])
user = config.get("rootAMIGA", "user")
password = config.get("rootAMIGA", "passw")
diff = diff_DB_CDS("amiga.iaa.es", "PAPERS_ISOLATION_VERLEY07b", user, password)
cdsnames = ['CIG', 'NName']
#url="http://vizier.u-strasbg.fr/viz-bin/votable?-source=J/A%2bA/470/505/table3&-out.max=unlimited"
url="/home/julian/Escritorio/votables/vizier_votable.vot"
diff.getTableFromCDS(url)
dtypes=[ ('cig', int), ('NName', int)]
dbnames =[pair[0] for pair in dtypes]
query = "SELECT `cig`, `NName` FROM `TABLE3`"
diff.getTableFromDB(query, dtypes)
tolerance = numpy.array([0, 0])
diff.compareTables(dbnames, cdsnames, tolerance)