diff --git a/adsingestp/parsers/jats.py b/adsingestp/parsers/jats.py index 593033a..6aa398e 100644 --- a/adsingestp/parsers/jats.py +++ b/adsingestp/parsers/jats.py @@ -16,6 +16,7 @@ class JATSAffils(object): regex_email = re.compile(r"^[a-zA-Z0-9+_.-]+@[a-zA-Z0-9-]+(\.[a-zA-Z0-9-]+)+") + regex_auth_xid = re.compile(r"^A[0-9]+$") def __init__(self): self.contrib_dict = {} @@ -94,6 +95,7 @@ def _fix_affil(self, affstring): # check for empty strings with commas check_a = a.replace(",", "") if check_a: + a = re.sub("\\(e-*mail:\\s*,+\\s*\\)", "", a) a = a.replace("\\n", ",") a = a.replace(" —", "—") a = a.replace(" , ", ", ") @@ -103,6 +105,7 @@ def _fix_affil(self, affstring): a = re.sub("^(\\s*,+\\s*)+", "", a) a = re.sub("(\\s*,\\s+)+", ", ", a) a = re.sub("(,\\s*)+$", "", a) + a = re.sub("\\s+$", "", a) if self.regex_email.match(a): emails.append(a) else: @@ -567,15 +570,18 @@ def parse(self, article_metadata): for aff in contrib_aff: # check and see if the publisher defined an email tag inside an affil (like IOP does) nested_email_list = aff.find_all("ext-link") + key = aff.get("id", default_key) for e in nested_email_list: if e.get("ext-link-type", None) == "email": - key = e["id"] + if e.get("id", None): + ekey = e["id"] + else: + ekey = key value = e.text # build the cross-reference dictionary to be used later - self.email_xref[key] = value + self.email_xref[ekey] = value e.decompose() - key = aff.get("id", default_key) # special case: get rid of ... 
aff = self._decompose(soup=aff, tag="sup") aff, aff_extids_tmp = self._get_inst_identifiers(aff) @@ -586,11 +592,25 @@ def parse(self, article_metadata): affstr = aff.get_text(separator=", ").strip() (affstr, email_list) = self._fix_affil(affstr) - if email_list: - self.email_xref[key] = email_list + if not self.email_xref.get(key, None): + if email_list: + self.email_xref[key] = email_list + else: + self.email_xref[key] = "" self.xref_dict[key] = affstr self.xref_xid_dict[key] = aff_extids_tmp + # special case: publisher defined aff/email xrefs, but the xids aren't + # assigned to authors; xid is typically of the form "A\d+" + # publisher example: Geol. Soc. London (gsl) + count_auth = len(authors_out) + count_xref = len(self.xref_dict.keys()) + if count_auth == count_xref: + for auth, xref in zip(authors_out, self.xref_dict.keys()): + if self.regex_auth_xid.match(xref): + if not auth.get("aff", []) and not auth.get("xaff", []): + auth["xaff"] = [xref] + self.contrib_dict = {"authors": authors_out, "contributors": contribs_out} # now get the xref keys outside of contrib-group: @@ -1027,6 +1047,10 @@ def _parse_ids(self): self.base_metadata["ids"]["pub-id"].append( {"attribute": "manuscript", "Identifier": self._detag(d, [])} ) + elif id_type == "url": + self.base_metadata["ids"]["pub-id"].append( + {"attribute": "url", "Identifier": self._detag(d, [])} + ) elif id_type == "other": self.base_metadata["ids"]["pub-id"].append( {"attribute": "other", "Identifier": self._detag(d, [])} @@ -1232,7 +1256,11 @@ def parse(self, text, bsparser="lxml-xml"): raise XmlLoadException(err) document = d.article - front_meta = document.front + # front_meta = document.front + try: + front_meta = document.front + except Exception as err: + raise XmlLoadException("No front matter found, stopping: %s" % err) self.back_meta = document.back self.article_meta = front_meta.find("article-meta") diff --git a/tests/stubdata/input/jats_gsl_unkeyed_xref.xml 
b/tests/stubdata/input/jats_gsl_unkeyed_xref.xml new file mode 100644 index 0000000..8eaada2 --- /dev/null +++ b/tests/stubdata/input/jats_gsl_unkeyed_xref.xml @@ -0,0 +1,110 @@ + + +
+geeageochemGeochemistry: Exploration, Environment, AnalysisGeochemistry: Exploration, Environment, AnalysisGeochemistry: Exploration, Environment, Analysis1467-78732041-4943The Geological Society of London10.1144/geochem.1.2.119119Regular ArticleA comparison of unsupervised neural networks and k-means clustering in the analysis of multi-element stream sediment data +A. P. Clare + + + + and D. R. Cohen + + + + + + Neural Mining Solutions, 1 Alfred Street, Sydney, NSW 2000, AustraliaSchool of Geology, University of New South Wales, Sydney, NSW 2052, Australia (e-mail: d.cohen@unsw.edu.au)5200112119134© 2001 AAG/The Geological Society of London2001 +

Isolation of complex patterns of correlation between variables, association among samples and anomaly identification, through conventional parametric multivariate statistical procedures, may be obscured by the presence of multivariate outliers and non-normal variable distributions. Procedures such as k-means clustering generally require substantial data pre-processing. Unsupervised neural networks (UNN) have the capacity to cluster multivariate data, using a modified form of the standard unsupervised Kohonen self-organizing map that is non-linear, non-parametric, rapid and robust. The number of clusters into which samples are allocated is determined by the unsupervised neural network and is directly dependent upon the original input data.

+

UNN and k-means clustering was performed on stream sediment geochemical data from 1670 sub-catchments in the northeast region of New South Wales. Both methods produced clusters for the feldspar-associated elements that were closely related to sub-catchment geology and topography. UNN clustering revealed more subtle variations within the major lithological groups. UNN clustering of Cu–Pb–Zn produced ten main clusters and identified 26 anomalies, that were mainly from sub-catchments, containing significant base metal mineralization occurrences. K-means clustering of transformed Cu–Pb–Zn yielded five major clusters and only 19 anomalies. Progressive increase in k from eight to 20 did not substantially alter the k-means classification of samples between common groups and anomalies. Some catchments identified only as anomalous by UNN clustering contain known base metal mineralization.

+
+ exploration geochemistry + ; neural network + ; clustering + ; anomaly + ; New England + hwp-legacy-fpage119hwp-legacy-docheadRegular Article
INTRODUCTION

A variety of techniques exist to quantify complex patterns of correlation between variables in large, multi-element geochemical datasets, and to establish association among samples. Most of these techniques require a reduction in the dimensionality of the dataset. Defining common groups (or clusters) that relate to regional scale geological processes and isolating anomalous samples is a fundamental task of geochemical analysis (Cheng et al. 1996). Anomaly detection and pattern recognition using conventional parametric statistical procedures, such as factor analysis and parametric clustering techniques, may be obscured by the presence of multivariate outliers and non-normal variable distributions (Chork 1990; Chork & Rousseeuw 1992). Multivariate anomalies may be identified using techniques such as modelling of Mahalanobis $D^2$ distances using $\chi^2$ probability plots (Garrett 1989) prior to clustering; however, this approach can be time consuming.

K-means clustering is one method for reducing data to groups of similar objects. It is of the partitional clustering type (MacQueen 1967) as it attempts to directly decompose data into a set of disjoint clusters. One problem with k-means clustering is that the clustering algorithms prefer certain cluster shapes (multivariate distributions), and the algorithms tend to assign data to clusters of such shapes even if no such clusters actually exist in the data (Kaski 1997). Another potential problem involves the choice of the number of clusters. Good initialization of the cluster centroids may also be crucial as real data clusters may be left empty if their centroids are distant from the initial estimates at the commencement of clustering (Kaski 1997).

Neural networks are an alternate approach to revealing complex relationships within multivariate datasets and identifying anomalies, and are less dependent on the characteristics of the input data than conventional methods (Benediktsson et al. 1990; Foody 1997). Unlike conventional statistical or rule-based systems, neural networks are not programmed to perform particular tasks according to strict rules, but to isolate patterns in data by training on historical data using a learning algorithm. Unsupervised neural networks do not use training data but look for patterns and relationships within the entire input dataset, and organize the network into a set of unique classes or groups (Dayhoff 1990). One potential benefit resulting from the design of unsupervised neural networks is that outlying observations can be included in the dataset without significantly affecting the main patterns and relationships identified. The neural network takes the spread of data into account and those samples not falling into a common cluster or pattern usually remain in smaller groups of anomalous or uncommon patterns.

The self-organizing map (SOM), developed by Kohonen (1995), is a neural network algorithm that has been used for a wide variety of applications, ranging from optimization of communications networks to predictive modelling in the finance industry. Comparison of clustering methods by Kaski (1997) indicates that k-means clustering is very closely related to SOM-based methods. An SOM is, however, both a method of clustering (which achieves dimension reduction) and a technique for non-linear projecting of data onto a lower-dimensional display (Kaski 1997).

This study compares the capacity of unsupervised neural network (UNN) clustering on non-preprocessed regional stream sediment data, to reduce dimensionality, define multivariate geochemical patterns and identify multivariate anomalies, with that of conventional k-means clustering.

K-means clustering

K-means clustering seeks to assign multivariate observations to a predetermined number of groups (k) each containing $n_k$ observations and a group centroid ($\bar{x}_k$) (Howarth 1983). Clustering commences with the random allocation of k original observations as the group centroids. Each observation is assigned to the group to whose centroid it is nearest. Once all observations are allocated to groups, the new group centroids are then recalculated and the original observations reallocated to the new groups. The process is repeated until the overall within-group scatter (W) is minimized (or the change in W becomes insignificant). $W = \sum_{k} \sum_{i=1}^{n_k} (x_{ik} - \bar{x}_k)^2$ + As well as allocating observations to groups, the posterior probability of an observation being a member of a group may be determined (R. G. Garrett, pers. comm.). From a geochemical perspective, anomalies may be defined as clusters with only a few observations or as observations displaying a low probability of membership of any of the main clusters.

K-means clustering is generally affected by the geometry of the multivariate data distributions. Whereas distortions can be reduced by rescaling variables (normalizing individual variables against the covariance matrix) or by transforming the original variables to normal distributions (Mancey & Howarth 1980), it is difficult to remove the bias induced by the inclusion of marginally anomalous observations within the various groups. As such, k-means clustering is generally used to partition data into major groups rather than to detect anomalies.

Neural networks

The unsupervised neural network applied in this study is a modified form of the standard unsupervised SOM, in which patterns in the n-dimensional input data are transformed into lower dimensional space which is topologically ordered (Kohonen 1989, 1990, 1995; Dayhoff 1990). The SOM comprises the input layers (consisting of m samples and n layers or variables) and a competitive layer organized as a grid of units (Fig. 1). For each sample, a vector (X) representing the n variables is compared with the vectors representing the weight vectors (W) of each unit in the network’s competitive layer. The software used in this study (Prospect Explorer™) commenced with a single vector, arbitrarily weighted according to the first observation encountered in the dataset. A matching value is computed for each unit in the competitive layer matrix. The unit with the greatest similarity to the input vector is then selected. The neighbourhood of units around the selected unit is then defined and the weights for all units in that neighbourhood adjusted, such that: $W_{ij}(\mathrm{new}) = W_{ij}(\mathrm{old}) + \Delta W_{ij}$ + where $\Delta W_{ij} = \eta\,(X - W_{ij})$ + and η is the learning rate (a small positive number that decreases as training continues).

Hence, the selected unit and its neighbourhood becomes more like the input pattern vector X and is more likely to be selected should the same or a similar input pattern be presented to the network. This procedure is then repeated so that eventually all patterns have been presented to the network numerous times over and similar patterns have been grouped (Kamgar-Parsi et al. 1990). The neural network ceases to group the data once the rate of change in the clustering falls below an arbitrary minimum threshold.

In most applications, observations are not strictly allocated to a specific cluster, but exhibit varying degrees of association (similarity) with the weighted vectors representing the derived clusters. The similarity of individual observations (as represented by their vectors) to the vectors representing each of the neural clusters established may be determined, or observations can be allocated to the cluster to which it displays the greatest similarity (Fig. 2). An anomalous cluster may, therefore, be defined as one where very few observations display a significant degree of similarity to the weighted vector representing that cluster. Anomalous observations are then defined as those displaying a high degree of similarity with those anomalous cluster vectors.

STUDY AREA

The study area, located in the northeastern corner of New South Wales, covers c. 30 000 km² and contains the catchments of the Clarence, Richmond and Tweed Rivers (Fig. 3). The western, southern and northeastern regions contain Silurian to Carboniferous metasedimentary rocks and associated volcanics, along with Permian volcanics and the Permo-Triassic plutonic suites of the New England Batholith. The metasedimentary rocks and volcanics form an accretionary complex. East of the Clarence River lies the Mesozoic Clarence-Moreton Basin, which contains sediments derived from the surrounding Palaeozoic basement. The basin is bounded on its western margin by a major meridional structural zone containing laterally extensive serpentinites. In the northeast and central south of the study area, extensive Tertiary basalt flows overlie much of the surrounding Palaeozoic basement and parts of the Clarence-Moreton Basin.

Various styles of mineralization are present in the area, although most occurrences are low grade and sub-economic (Gilligan & Barnes 1990). Mineralization is predominantly of the quartz vein-hosted, skarn, greisen or epithermal styles, with the main commodities being Au, As, Sb, W, Sn, Mo, Cu and Zn. The principal Au and base metal deposits are located around Drake, Timbarra, Tooloom, Baryulgil, Jackadgery, Dalmoreton and Glen Innes. A comprehensive mineral occurrence database has been assembled by the NSW Department of Mineral Resources (Barnes et al. 1996).

The topography is influenced by the underlying geology. The western margin of the study area is characterized by high (up to 1200 m altitude), gently undulating plateau country associated with the New England Batholith. The Clarence-Moreton Basin displays subdued topographic relief and forms extensive lowland floodplains for the major river systems. The older Palaeozoic rocks and the Cenozoic basalts form more dissected terrains.

DATASET

The region was divided into 1670 sub-catchments, ranging in area from 8 to 50 km2. The choice of sampling density related to the scale of spatial variation in rock types, the distribution of known mineral deposits and the size of drainage sub-catchments (Cohen et al. 1995, 1999). Specific sites were selected on major creeks or rivers at the outlet point of discrete, closed drainage basins. At these locations, 5 kg of active channel stream sediment were collected. The stream sediments were sieved to <250 μm prior to analysis by X-ray fluorescence spectrometry and instrumental neutron activation analysis. This fraction is coarser than that typically used in exploration geochemical surveys and was intended to enhance the primary lithological signature of the stream sediment data, yet preserve secondary (hydromorphic) geochemical dispersion patterns.

DATA PROCESSING: K-means clustering

Normal probability plotting of the raw data indicated both the existence of multiple populations and a positive skewness to the main body of data. Preliminary testing of k-means clustering on the raw data resulted in isolation of very few anomalies and poor separation of the major lithologies between clusters. Each variable was, therefore, cleaned and normalized by removing gross outliers plus a further 5% of observations from the upper end of the distribution, and determining the Box-Cox transformation [$x' = (x^{\lambda} - 1)/\lambda$] that reduced skewness to 0 (Box & Cox 1964). The excluded values were then returned to the dataset and the transformation applied. Other methods of robust estimation could also have been used.

Selection of the optimum number of clusters was determined after testing selected groups of variables for cluster numbers ranging from six to 20. In the case of Cu–Pb–Zn, extending the number of clusters from six to eight had no significant effect on either the main groupings of observations (the first three main clusters) or on the identification of anomalies (Fig. 4). Extending from eight to ten clusters resulted in a slight reduction in the number of anomalies (from 22 to 18) and sub-division of the main clusters into overlapping groups (Fig. 5). Further increase in the number of clusters served only to subdivide the small anomalous clusters into even smaller clusters or to isolate anomalous values, rather than to split the main clusters into sub-clusters.

UNN clustering

In order to analyse the stream sediment geochemical results from each catchment, the polygon (catchment boundary) attributed data were converted to a grid. This was a requirement of the software used rather than an intrinsic requirement of UNN methods. The smallest catchment in the study area was c. 2500 m2 and a grid cell of 400 m2 was deemed to be sufficient to accurately represent the geochemical signatures of each catchment. To prevent creation of artificial values between sparsely distributed sample points, the catchment polygons were rasterized so that each grid cell within a given catchment was given the same geochemical value (a procedure discussed by Bonham-Carter 1994). The resulting input for the neural network involved 300 000 cells by 33 elements. The UNN clustering was run initially on all 33 elements from the stream sediment dataset to establish the main associations. No information on sub-catchment geology or the distribution of known mineralization occurrences was provided to the UNN.

For the complete set of variables, there were 14 common clusters and a number of clusters for which very few observations displayed a high index of similarity (i.e. UNN anomalies). The common clusters were closely related to the distribution of the major lithological groups in the area and the main element associations, as defined by the normalized weights (loadings) for each cluster, were:

Ba–Sr–Rb–Na–K–Pb–Eu±Ga Feldspar association

Cu–Zn±Pb Base metal mineralization association

Fe–Co–Cu–Ni–Cr±Zn Mafic/ultramafic and Fe-oxide adsorption association

REE–Nb–Ta–Hf–Zr–Th Heavy mineral association

These associations closely agreed with those established using factor analysis on the transformed dataset (Cohen et al. 1995). Both the UNN and k-means clustering were rerun on these four associations to evaluate both cluster patterns and the distribution of anomalies (Clare & Cohen 1999). To demonstrate the relative performance of the k-means clustering and the UNN clustering, the distribution of patterns for the feldspar association and the anomaly detection results for the base metal association will be compared.

RESULTS: The feldspar association

The structure of the k-means and UNN clusters are presented in Fig. 6 in terms of the mean and standard deviation for each (transformed) element of a k-means cluster and the normalized value (or vector) representing a UNN cluster. The distribution of the allocated k-means clusters and an index of the similarity between selected UNN cluster vectors and individual catchment (grid cell) data are presented in Figs 7 to 9. The correspondence between k-means and UNN clusters and their relationship to the regional lithologies is summarized in Table 1 and the distribution of k-means clusters relative to catchment lithological group is indicated in Table 2.

Both the k-means and UNN clustering display a very strong lithological or geomorphic association. The clustering defines four main spatial associations: the granites, the accretionary complex, the Tertiary basalts and the Clarence-Moreton Basin sedimentary rocks.

Granites

The distribution of catchments allocated to k-means cluster 3 shows a high degree of similarity to those displaying a strong similarity index to UNN cluster 2 (Figs 7 and 8) and are characterized by low Ba, Eu and Sr values and high K, Na and Pb (Fig. 6). These clusters are restricted mainly to the granites along the western boundary of the area, with 58% of k-means cluster 3 being contained within granite-dominated catchments and 36% in areas of alluvium. K-means cluster 4 is more evenly distributed amongst granites, metasedimentary rocks and alluvium. It covers some of the less differentiated, plagioclase-rich granites and has higher Ba and Eu loadings but lower K contents than cluster 3.

Accretionary complex

The accretionary complex differs from the granitic clusters in its higher Ba and Eu values and lower K, Pb and Rb values. Whereas this region is covered by a single k-means cluster (cluster 5 for which 68% of the catchments are in the accretionary complex), there are three UNN clusters associated with this region. The UNN clusters display a subtle variation in geochemical signature and spatial distribution, and form three contiguous sub-regions in the accretionary complex. UNN cluster 1 (Fig. 8), for example, is characterized by more elevated Ba values than the other two clusters and covers the southwestern part of the complex. UNN cluster 3 is characterized by lower Eu, K, Na, Rb and Sr values than UNN cluster 1 and is most strongly reflected in the southeastern and eastern sections of the complex, including the Gundahl Complex (a tectonic melange of metasediments and greenstones of altered mafic volcanics) and the basal conglomerates of the Clarence-Moreton Basin.

Tertiary basalts

K-means cluster 2 and UNN clusters 5 and 8 delineate the main Tertiary basalt flows in the northeast, northwest and the south (Figs 7 and 8). There also appears to be an association between these clusters and the intermediate to basic volcanics and volcanoclastics southeast of Tenterfield and the Gordonbrook Serpentinite near Baryulgil. The clusters are characterized by higher Ba, Eu and Sr and lower K and Pb values than the granites, reflecting the predominance of calcic plagioclase over potassic and sodic feldspars.

Clarence-Moreton Basin

K-means clusters 1 and 6 and UNN clusters 7, 9 and 10 are mainly restricted to the Clarence-Moreton Basin. All clusters are generally characterized by low loadings (or cluster means) for all six elements relative to the other lithological units. Catchments allocated to k-means cluster 1 are more dominant in the centre of the basin and the element loadings indicate the highly weathered and transported nature of the stream sediments. The UNN clusters display subtle variations in the composition of sediments from the edge to the centre of the basin with a significant decrease in Ba, Rb, Sr and Na towards the basin centre. UNN cluster 7 is most strongly related to marine clastic units and coal measures along the western and southern edges of the basin. The mid to upper sequences of the Clarence-Moreton Basin, defined by UNN clusters 9 and 10, are typified by very low Ba, Na, Pb and Sr and a slight K enrichment. UNN cluster 9 also shows a spatial association with the northeastern part of the accretionary complex (Fig. 8).

The base metal mineralization suite

The k-means and UNN cluster structures, and normalized values for the UNN anomalies, are presented in Fig. 9 and spatial patterns in Figs 10 and 11.

Spatial distributions

UNN cluster 1 is generally confined to the Gordonbrook Serpentinite (near Baryulgil) and clusters 2 and 5 to the Tertiary basalts and the Clarence River estuary (which acts as both a mechanical and chemical ‘sink’ for most trace metals). These correspond with k-means cluster 6 and have high Cu means and low Pb means. A number of isolated catchments defined by UNN cluster 1 occur on a N–NW trend and include the Gordonbrook Serpentinite and the Gundahl Complex.

UNN clusters 6 to 10 and k-means clusters 2 and 5 represent the accretionary complex, with over 40% of allocated catchments located within metasediments and metavolcanics. This group of clusters exhibit similar loadings for Cu, Pb and Zn. The k-means clusters 2 and 5 are interspersed and would appear to relate to variations in Pb concentrations. Variations in the similarity between the sediment concentrations for Cu, Pb and Zn and the UNN clusters represent both subtle changes in lithological compositions within the complex and proximity to the main belt of base metal mineralization that extends from Jackadgery to Drake. The group of clusters covers virtually all the accretionary wedge units and some of the rhyodacitic units near Drake. The exception to this main grouping of clusters is cluster 8, which appears to delineate some of the intrusive units either marginal to or intruding the accretionary complex.

UNN cluster 3 and k-means cluster 3 have very low Cu and Zn values, moderate to low Pb values, and define the granites along the western edge of the region and the western side of the Clarence-Moreton Basin. UNN cluster 4 and k-means cluster 1 are characterized by low Pb, and very low Zn and Cu loadings or means. They are difficult to assign to any specific lithological group and their distribution appears to be structurally or topographically controlled. The most obvious association is with the main drainage of the upper Clarence River and coincidence with the western margin of the Clarence-Moreton Basin (57% of k-means cluster 1 catchments are located in areas of alluvium).

Anomaly detection

The UNN detected ten common clusters and one composite group of 26 anomalous catchments. The distribution of the anomalous catchments, and the medium and small sized mineral deposits recorded in the area are shown in Fig. 12. The relative performance of (i) k-means clustering on log-transformed data (ii) k-means clustering on Box-Cox transformed data and (iii) UNN clustering on raw data is summarized in Fig. 13.

For log-transformed data, the k-means clustering identified nine anomalies (clusters with <15 observations out of a total pool of 1670 observations), whereas 19 anomalies were detected for the transformed data. By contrast, UNN detected 26 anomalies of which 17 were also identified by the k-means clustering. In nearly all cases, the observations deemed anomalous by both methods were also detectable on the basis of their individual Cu, Pb or Zn values. The nine anomalies that were only identified by UNN include a series of six observations with low Cu, Zn and Pb values, together with a further three contained within the main k-means clusters. Conversely, two samples were determined to be anomalous under k-means clustering but not UNN. Most of the observations within three units of the joint Cu–Zn mean were only identified as being anomalous on the basis of their combined Cu–Pb–Zn signature.

The UNN anomalies fall into three distinct groups (Fig. 10); (1) high-value multivariate anomalies, (2) low-value multivariate anomalies, and (3) anomalies associated with individual variables. In group 1, anomalies U/K4, U/K5a–g and U/K17 are characterized by extremely high values for all three elements. U/K4 and U/K5 incorporate catchments containing known base metal and Au mineral deposits. U/K5 incorporates a series of seven catchments that rim the southern side of a plateau, which contains the Timbarra Gold Mine and a number of adjacent base metal mineralization zones. Anomalies U/K6,7,9,11,12 contain at least two variables with highly elevated values. U/K7 includes catchments surrounding the major copper mineralization in the area at the Cangai Copper Mine and other smaller workings. Anomalies U/K9–12 and U/K17 contain no known mineralization but are found within the vicinity of Tertiary basalts, south of the axis of accretionary complex base metal deposits that extend from Dorrigo to Glencoe.

The low-value multivariate anomalies, U/K2 and U/K18, coincide with the basal units of the Clarence-Moreton Basin but differ from the main cluster delineating these units by the absence of Cu and the strongly elevated Pb values. Anomalies U14–16 all contain anomalously low metal contents and were from areas of low topographic relief in the Clarence-Moreton Basin. They are not identified as anomalous by the k-means clustering.

The remaining anomalies are ‘non-outliers’. Anomaly U1 is characterized by intermediate Cu and Zn values and low Pb. The underlying lithology for the catchment is wholly Tertiary basalts. The main difference between the Tertiary Basalt cluster and this anomalous catchment is the strong Pb depletion. U3 is characterized by moderate to low Cu and Zn values and low Pb. This catchment is located between the Timbarra and Drake mineral fields, which contain a number of base metal occurrences. U/K6 is within a catchment containing the small Snapes Lode deposit (a Ag–Au–Pb–Zn occurrence with no recorded production history) and U8 coincides with a leucocratic granitic stock and associated Bi–Mo mineralization. The anomalous character of the catchment (although not due to high values) may be due to the association with the known mineralization.

Anomaly U/K9 is characterized by moderate to high Pb and Cu and moderate Zn values. Two small known Cu mines existed in the vicinity of the catchment. U10 characterized by moderate to high values for Cu, Pb and Zn is not found to be associated with any recorded mineralization, the catchment being contained wholly within Tertiary basalts. U13 and K21 are characterized by moderate Pb and Zn values and low Cu, with U13 in the main catchment for the town of Grafton. The elevated Pb and Zn values, compared to the surrounding catchments, are possibly due to the urban and localized industrial development associated with the town at the head of the estuarine portion of the Clarence River. Anomalies U8,10,13,19 are unique to the UNN clustering, and anomalies K20 and K21 to the k-means clustering.

DISCUSSION

Two key aspects of processing of regional geochemical datasets are (i) establishing common patterns (between variables or samples) that may be related to variations in geological characteristics or geochemical processes operating in a region; and (ii) identifying anomalies. In most cases, such objectives require a reduction in the dimensionality of the dataset.

Traditional methods of dimension reduction, such as factor analysis, or multivariate sample clustering and anomaly detection, such as k-means clustering, generally require both pre-processing of datasets and some degree of a priori knowledge of the structure of the data (i.e. the number of factors or clusters to extract). For k-means clustering this may require some degree of trial-and-error. Whereas part of the function of UNN clustering is to establish objectively the number of clusters present in the dataset, there still remains the question as to whether the number of clusters formed is ‘optimum’. In this study, the question of performance has been resolved simply in terms of the number of anomalies detected by the two methods and the extent to which the common clusters relate to the geology and topography of the region.

For the transformed Cu–Pb–Zn data, varying the number of clusters and observing the subsequent grouping of samples indicated a maximum of five main clusters (each with >200 observations) dominated by the main lithological groups and a set of 17 observations in small clusters (anomalies). Beyond a certain value for k, an increase in clusters served only to subdivide the anomalies into smaller clusters and progressively subdivide the larger (common) clusters without suitable criteria to establish a limit on the number of clusters. For the feldspar-associated elements, variation in the value for k had a similar effect. Under UNN clustering, transformation of the data (such as log-transformation) or exclusion of anomalous observations, prior to modelling, had no significant effect on the number of common clusters or the number of anomalies.

As noted by Kaski (1997), although the k-means clustering algorithm and SOM are very closely related, the number of k-means clusters initially chosen should accord with the number of (real) clusters present in the dataset, whereas under SOM procedures this is not necessary (under supervised neural networks the number of reference vectors can be chosen to be much larger than the number of actual clusters in the data). The direct result of UNN clustering and self-organizing of data is to assign all clusters with few observations to a multicharacter anomalous group. It is the lack of commonality with the common patterns that defines observations as being anomalous.

Comparing the performance of the two methods, both k-means clustering (on transformed data) and UNN self-organizing of the data (without removal of outliers or data transformation), were clearly able to define the common patterns within the whole population. These common patterns were consistent with the geology of the catchments. Both methods also identified a number of anomalous catchments, using Cu–Pb–Zn, that contained known base metal mineralization.

UNN was able to show greater subtlety in geochemical patterns than the classical k-means technique. This is not just a function of the different approach to the linking of observations to clusters (degree of similarity for UNN; absolute grouping for k-means clustering), but may relate to restriction of the shape of k-means clusters to hyperellipsoids. A greater equivalence between the output from the UNN and k-means clustering could be achieved by determining the probability of a sample being allocated to a given k-means cluster by relating the distance of an observation to the k-means cluster centroid using a measurement such as the Mahalanobis D2.

Whereas both methods could identify outlier anomalies (observations with one or more elements displaying high values), only the UNN could identify non-outlier anomalies (observations within the common spread of values for each element, but where there is a natural hiatus in data continuity in the original n-dimensional space). The UNN anomalies could be more easily subdivided into groups displaying some degree of similarity.

In terms of utility, the UNN technique tested was superior to the k-means clustering. The actual analytical time taken to define clusters and anomalous catchments from presentation of the raw data to the UNN was in the order of minutes. The k-means clustering required some hours of data manipulation and pre-processing prior to running the models.

CONCLUSION

The application of an unsupervised neural network described in this study demonstrates the effectiveness of a non-linear, non-parametric approach to the analysis of geochemical data. Unlike conventional clustering methods, the patterns in the data established using UNN appear to be relatively unaffected by the presence of outlying values and non-normal populations, so commonly exhibited by geochemical datasets such as the one examined in this study. The k-means clustering produced similar results to the UNN clustering; however, it proved less efficient at identifying anomalous geochemical signatures in the stream sediment data.

The UNN has proved capable of extracting both common geochemical patterns and the subset of anomalous patterns by reducing the complex higher dimensionality raw data layers into lower dimensionality ordered groups. Defining complex relationships within the data, using UNN, was efficiently performed without any a priori knowledge of variable characteristics, relationships between variables, or spatial correlations between samples.

The study demonstrates unsupervised neural networks to be a viable alternative to conventional statistical approaches in the modelling of multivariate geochemical data.

+

The authors wish to acknowledge R. Barnes, K. McDonald and the NSW Department of Mineral Resources for the supply of the digital geology and mineral deposit datasets. Surtec Pty. Ltd, J. Yong, E. Fellenberg and I. Wainwright provided valuable assistance in the project. Critique of the neural methods was provided by T. Gedeon. The reviewers are also thanked for their comments.

+
+ + BARNES, R. G., BROWNLOW, J, ALDER, D. & 6 others 1996. Mineral resources. In: Regional Report of Upper North East New South Wales, Volume 5; Socio-economic Values. Resource and Conservation Assessment Council, Sydney, 73–154. + + + BENEDIKTSSON, J. A., SWAIN, P. H. & ERSOY, O. K. 1990. Neural network approaches versus statistical methods in classification of multisource remote sensing data. IEEE Transactions on Geoscience and Remote Sensing 4, 540-552. + + + BONHAM-CARTER, G. F. 1994. Geographic Information Systems for Geoscientists: Modelling with GIS. Pergamon Publishing, Oxford. + + + BOX, G. E. P. & COX, D. R. 1964. An analysis of transformations. Journal of the Royal Statistical Society, Series B 26, 211-243. + + + CHENG, Q., AGTERBERG, F. P. & BONHAM-CARTER, G. F. 1996. A spatial analysis method for geochemical anomaly separation. Journal of Geochemical Exploration 56, 183-195. + + + CHORK, C. Y. & ROUSSEEUW, P. J. 1992. Integrating a high-breakdown option into discriminant analysis in exploration geochemistry. Journal of Geochemical Exploration 43, 191-203. + + + CHORK, C. Y. 1990. Unmasking multivariate anomalous observations in exploration geochemical data from sheeted vein tin mineralization near Emmaville, N.S.W, Australia. Journal of Geochemical Exploration 37, 205-223. + + + CLARE, A. P. & COHEN, D. R. 1999. An unsupervised neural network approach to the analysis of multi-element stream sediment data, northeastern NSW, Australia. In: Proceedings of the 19th International Geochemical Exploration Symposium, Vancouver, 12–16 April, 1999. + + + COHEN, D. R., RUTHERFORD, N. F. & GARNETT, D. L. 1995. A Geochemical Survey of the Upper Northeast Region, New South Wales. NSW Department of Mineral Resources, Sydney. + + + COHEN, D. R., SILVA-SANTISTEBAN, C. M, RUTHERFORD, N. F, GARNETT, D. L. & WALDRON, H. M. 1999. Comparison of vegetation and stream sediment geochemical patterns in the north eastern region of New South Wales. 
Journal of Geochemical Exploration 66, 469-489. + + + DAYHOFF, J. E. 1990. Neural Network Architectures: An Introduction. Van Nostrand, New York. + + + FOODY, G. M. 1997. Fully fuzzy supervised classification of land cover from remotely sensed imagery with an artificial neural network. Neural Computing and Applications 5, 238-247. + + + GARRETT, R. G. 1989. The chi-square plot: a tool for multivariate outlier recognition. Journal of Geochemical Exploration 32, 319-341. + + + GILLIGAN, L. B. & BARNES, R. G. 1990. New England Fold Belt, New South Wales – Regional Geology and Mineralisation. In: Hughes, F.E. (ed) . Geology of the Mineral Deposits of Australia and Papua New Guinea. Australian Institute of Mining and Metallurgy. Monograph, 14, 1417-1423. + + + HOWARTH, R. J. 1983. Statistics and Data Analysis in Geochemical Prospecting. In: Govett, G.J.S. (ed) . Handbook of Exploration Geochemistry, Volume 2. Elsevier, Amsterdam. + + + KAMGAR-PARSI, B., GUALTIERI, J. A. & DEVANEY, J. E. 1990. Clustering with neural networks. Biological Cybernetics 63, 210-208. + + + KASKI, S. 1997. Data exploration using self-organizing maps. Acta Polytechnica Scandinavica, Mathematics, Computing and Management in Engineering Series, 82. + + + KOHONEN, T. 1989. Self-organisation and Associative Memory. Springer-Verlag, Berlin. + + + KOHONEN, T. 1990. The self-organizing map. Proceedings of the IEEE 78, 1464-1480. + + + KOHONEN, T. 1995. Self-Organizing Maps. Springer-Verlag, Berlin. + + + MACQUEEN, J. 1967. Some methods for classification and analysis of multivariate observations. In: Le Cam, L.M, Neyman, & J. (eds) . Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1. University of California Press, Berkeley281-297. + + + MANCEY, S. J. & HOWARTH, R. J. 1980. Power-transform removal of skewness from large data sets. Institute for Mining and Metallurgy Transactions, Section B 89, 92-97. + +

Combination of vector x1 . . . xn representing the multivariate signature of a cell from the original dataset, with the weight vector wi1 . . . win for the ith unit of the competitive layer.

UNN clustering of objects in n-dimensional space. Anomalies may be defined as clusters with very few observations.

Simplified geological map of the study area (after Barnes et al. 1996).

Comparison of cumulative percentages of observations clustered by k-means clustering of Cu–Pb–Zn under different values of k. Variables have been Box-Cox transformed.

Projection of k-means clusters onto Cu–Zn space for the base metal suite for six-, eight- and ten-cluster models. Ellipses represent the cluster μ±1σ for the transformed Cu and Zn variables.

Spread of values (μ±1σ) for the feldspar association under an eight-group k-means clustering and the normalized values for the ten common UNN clusters.

Spatial distribution of catchment group allocation under an eight-group k-means clustering of the feldspar-associated elements (Ba, Na, K, Sr, Eu, Pb).

Similarity index for UNN clusters 1, 2, 8 and 9 for the feldspar-associated elements (Ba, Na, K, Sr, Eu, Pb).

(a) Spread of values (μ±1σ) for elements within a 10-group k-means clustering of Cu–Pb–Zn; (b) Normalized values for the ten common UNN clusters of Cu–Pb–Zn and normalized values for the 16 UNN anomalies. Group A is anomalies in catchments with known mineralization, group B in catchments with no known significant mineralization and group C is a small set of observations with very low Cu+Zn±Pb values.

Allocation of k-means and UNN clusters to lithological associations for the feldspar (Ba–Eu–K–Na–Sr–Pb) and Cu–Pb–Zn associations


Cluster
Association
k-means
UNN
Lithological association
Feldspar1, 67, 9, 10Clarence-Moreton Basin
25, 8Tertiary basalts and ultramafics
3, 42Granites
51, 3, 4Metasedimentary and metavolcanic rocks (accretionary complex)
7, 8RemainderAnomalies
6None
Cu–Pb–Zn33Granites and basal Clarence-Moreton Basin
62, 5Tertiary basalts and Clarence River Estuary
61Serpentinite and meta-basic volcanics
5±26 to 10Accretionary complex
14Western Clarence River System
4, 7 to 10RemainderAnomalies

Cross tabulation of the percent distribution for each k-means cluster across the five main catchment geological groups for (a) feldspar association (Ba–Eu–K–Na–Pb–Sr) under an eight-cluster model and (b) Cu–Pb–Zn suite under a ten-cluster model. The anomalies represent the summation of clusters containing <2% of the total number of catchments


Geological Group
Mafics and ultramafics
Granites and volcanics
Metasediments
Alluvium

Total catchments
8.7
19.1
34.1
38.1
(a) Feldspar association
 Cluster14.31.128.066.7
230.414.619.635.4
31.058.14.836.2
42.637.630.729.1
50.48.863.827.1
613.65.830.450.3
 Anomalies8.717.443.530.4
(b) Cu–Pb–Zn association
 Cluster16.18.228.657.1
25.915.642.436.2
30.653.910.734.8
54.510.751.733.1
632.07.526.534.0
 Anomalies18.29.145.527.3

Spatial distribution of catchment group allocation under a ten-group k-means clustering of Cu–Pb–Zn.

Similarity index for UNN clusters 1, 2, 3 and 6, of Cu–Pb–Zn.

Distribution of anomalous catchments derived from k-means clustering (Kn) and UNN clustering (Un) of Cu–Pb–Zn, as well as the location of major occurrences of base metal mineralization.

(a) Plot of anomalous Cu–Pb–Zn clusters, determined using UNN clustering on raw data and k-means clustering of both log-transformed and Box-Cox transformed data. (b) Cross tabulations of background and anomalous samples identified by k-means and UNN clustering.

+ + + + + + + + + + + + + +
diff --git a/tests/stubdata/input/jats_indersci_url_ident.xml b/tests/stubdata/input/jats_indersci_url_ident.xml new file mode 100644 index 0000000..d12095e --- /dev/null +++ b/tests/stubdata/input/jats_indersci_url_ident.xml @@ -0,0 +1,67 @@ + + +
+ + +ijogct + +International Journal of Oil, Gas and Coal Technology + +1753-3309 +1753-3317 + +Inderscience Publishers (IEL) + + + +https://www.inderscienceonline.com/doi/10.1504/IJOGCT.2024.139531 + +Simulation study on frictional resistance and influencing factors of flexible screen pipe tripping into horizontal wellbore + + +Zhongzhi Hu1, +Junliang Li2, +Li Wang3, +Jinbo Wang4, +Xinyang Liu5, +Yang Kong6 +School of Mechanical Engineering, Sichuan University of Science & Engineering, No.1 Baita Road, Yibin, Sichuan, 644002, China +Production Engineering Research Institute, Daqing Oilfield, CNPC, No. 9 Xibin Road, Daqing, Heilongjiang, 163712, China +Production Engineering Research Institute, Daqing Oilfield, CNPC, No. 9 Xibin Road, Daqing, Heilongjiang, 163712, China +Dongying Ruifeng Petroleum Technical Development Co., Ltd., No. 5 Juzhou Road, Dongying, Shandong, 257299, China +Bulk Material Transportation Equipment Business Unit, Chengdu Gongbei Intelligent Technology Co., Ltd., No. 7 Guanghua Street, Chengdu, Sichuan, 610011, China +Dongying Ruifeng Petroleum Technical Development Co., Ltd., No. 5 Juzhou Road, Dongying, Shandong, 257299, China + + +2024 + + +03 +7 +2024 + +36 +1 +55 +76 + +Copyright © 2024 Inderscience Enterprises Ltd. +2024 + + + + +

Flexible sand control screens play a crucial role in the operation and economic benefits of ultra-short radius wells. We established a calculation model for the motion friction resistance of flexible screens and a criterion for contact between screen units and wellbore walls. Case analysis clarified the correlation between several key factors (e.g., maximum allowed swing angle, friction coefficient, insertion speed, and total length of screen units) and the frictional resistance of flexible screens. We utilised the overall distribution of contact point positions and the compressed distance between adjacent screen unit centroids to analyse the conversion mechanism of screen friction resistance from linear to nonlinear rapid accumulation. We also explored measures to improve the extension capacity of flexible screens. Our findings provide a basis for the optimisation design of flexible screen structures and the assessment of their maximum extension capacity. [Received: June 2, 2023; Accepted: February 13, 2024]

+
+ +flexible screen, +sand control, +multi-body dynamics simulation, +frictional resistance + + + + +
+
+
diff --git a/tests/stubdata/output/jats_gsl_unkeyed_xref.json b/tests/stubdata/output/jats_gsl_unkeyed_xref.json new file mode 100644 index 0000000..5d910f4 --- /dev/null +++ b/tests/stubdata/output/jats_gsl_unkeyed_xref.json @@ -0,0 +1,129 @@ +{ + "abstract": { + "textEnglish": "Isolation of complex patterns of correlation between variables, association among samples and anomaly identification, through conventional parametric multivariate statistical procedures, may be obscured by the presence of multivariate outliers and non-normal variable distributions. Procedures such as k-means clustering generally require substantial data pre-processing. Unsupervised neural networks (UNN) have the capacity to cluster multivariate data, using a modified form of the standard unsupervised Kohonen self-organizing map that is non-linear, non-parametric, rapid and robust. The number of clusters into which samples are allocated is determined by the unsupervised neural network and is directly dependent upon the original input data.\nUNN and k-means clustering was performed on stream sediment geochemical data from 1670 sub-catchments in the northeast region of New South Wales. Both methods produced clusters for the feldspar-associated elements that were closely related to sub-catchment geology and topography. UNN clustering revealed more subtle variations within the major lithological groups. UNN clustering of CuPbZn produced ten main clusters and identified 26 anomalies, that were mainly from sub-catchments, containing significant base metal mineralization occurrences. K-means clustering of transformed CuPbZn yielded five major clusters and only 19 anomalies. Progressive increase in k from eight to 20 did not substantially alter the k-means classification of samples between common groups and anomalies. Some catchments identified only as anomalous by UNN clustering contain known base metal mineralization." 
+ }, + "authors": [ + { + "affiliation": [ + { + "affPubRaw": "Neural Mining Solutions, 1 Alfred Street, Sydney, NSW 2000, Australia" + } + ], + "name": { + "given_name": "A. P.", + "surname": "Clare" + } + }, + { + "affiliation": [ + { + "affPubRaw": "School of Geology, University of New South Wales, Sydney, NSW 2052, Australia" + } + ], + "attrib": { + "email": "d.cohen@unsw.edu.au" + }, + "name": { + "given_name": "D. R.", + "surname": "Cohen" + } + } + ], + "copyright": { + "statement": "\u00a9 2001 AAG/The Geological Society of London", + "status": true + }, + "keywords": [ + { + "keyString": "exploration geochemistry", + "keySystem": "misc" + }, + { + "keyString": "neural network", + "keySystem": "misc" + }, + { + "keyString": "clustering", + "keySystem": "misc" + }, + { + "keyString": "anomaly", + "keySystem": "misc" + }, + { + "keyString": "New England", + "keySystem": "misc" + } + ], + "pagination": { + "firstPage": "119", + "lastPage": "134", + "pageRange": "119-134" + }, + "persistentIDs": [ + { + "DOI": "10.1144/geochem.1.2.119" + } + ], + "pubDate": { + "printDate": "2001-05-00" + }, + "publication": { + "ISSN": [ + { + "issnString": "1467-7873", + "pubtype": "ppub" + }, + { + "issnString": "2041-4943", + "pubtype": "epub" + } + ], + "issueNum": "2", + "pubName": "Geochemistry: Exploration, Environment, Analysis", + "pubYear": "2001", + "publisher": "The Geological Society of London", + "volumeNum": "1" + }, + "publisherIDs": [ + { + "Identifier": "119", + "attribute": "publisher-id" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "JATS", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "references": [ + " BARNES, R. G., BROWNLOW, J, ALDER, D. & 6 others 1996. Mineral resources. In: Regional Report of Upper North East New South Wales, Volume 5; Socio-economic Values. Resource and Conservation Assessment Council, Sydney, 73154. ", + " BENEDIKTSSON, J. A., SWAIN, P. H. & ERSOY, O. K. 1990. 
Neural network approaches versus statistical methods in classification of multisource remote sensing data. IEEE Transactions on Geoscience and Remote Sensing 4, 540-552. ", + " BONHAM-CARTER, G. F. 1994. Geographic Information Systems for Geoscientists: Modelling with GIS. Pergamon Publishing, Oxford. ", + " BOX, G. E. P. & COX, D. R. 1964. An analysis of transformations. Journal of the Royal Statistical Society, Series B 26, 211-243. ", + " CHENG, Q., AGTERBERG, F. P. & BONHAM-CARTER, G. F. 1996. A spatial analysis method for geochemical anomaly separation. Journal of Geochemical Exploration 56, 183-195. ", + " CHORK, C. Y. & ROUSSEEUW, P. J. 1992. Integrating a high-breakdown option into discriminant analysis in exploration geochemistry. Journal of Geochemical Exploration 43, 191-203. ", + " CHORK, C. Y. 1990. Unmasking multivariate anomalous observations in exploration geochemical data from sheeted vein tin mineralization near Emmaville, N.S.W, Australia. Journal of Geochemical Exploration 37, 205-223. ", + " CLARE, A. P. & COHEN, D. R. 1999. An unsupervised neural network approach to the analysis of multi-element stream sediment data, northeastern NSW, Australia. In: Proceedings of the 19th International Geochemical Exploration Symposium, Vancouver, 1216 April, 1999. ", + " COHEN, D. R., RUTHERFORD, N. F. & GARNETT, D. L. 1995. A Geochemical Survey of the Upper Northeast Region, New South Wales. NSW Department of Mineral Resources, Sydney. ", + " COHEN, D. R., SILVA-SANTISTEBAN, C. M, RUTHERFORD, N. F, GARNETT, D. L. & WALDRON, H. M. 1999. Comparison of vegetation and stream sediment geochemical patterns in the north eastern region of New South Wales. Journal of Geochemical Exploration 66, 469-489. ", + " DAYHOFF, J. E. 1990. Neural Network Architectures: An Introduction. Van Nostrand, New York. ", + " FOODY, G. M. 1997. Fully fuzzy supervised classification of land cover from remotely sensed imagery with an artificial neural network. 
Neural Computing and Applications 5, 238-247. ", + " GARRETT, R. G. 1989. The chi-square plot: a tool for multivariate outlier recognition. Journal of Geochemical Exploration 32, 319-341. ", + " GILLIGAN, L. B. & BARNES, R. G. 1990. New England Fold Belt, New South Wales Regional Geology and Mineralisation. In: Hughes, F.E. (ed) . Geology of the Mineral Deposits of Australia and Papua New Guinea. Australian Institute of Mining and Metallurgy. Monograph, 14, 1417-1423. ", + " HOWARTH, R. J. 1983. Statistics and Data Analysis in Geochemical Prospecting. In: Govett, G.J.S. (ed) . Handbook of Exploration Geochemistry, Volume 2. Elsevier, Amsterdam. ", + " KAMGAR-PARSI, B., GUALTIERI, J. A. & DEVANEY, J. E. 1990. Clustering with neural networks. Biological Cybernetics 63, 210-208. ", + " KASKI, S. 1997. Data exploration using self-organizing maps. Acta Polytechnica Scandinavica, Mathematics, Computing and Management in Engineering Series, 82. ", + " KOHONEN, T. 1989. Self-organisation and Associative Memory. Springer-Verlag, Berlin. ", + " KOHONEN, T. 1990. The self-organizing map. Proceedings of the IEEE 78, 1464-1480. ", + " KOHONEN, T. 1995. Self-Organizing Maps. Springer-Verlag, Berlin. ", + " MACQUEEN, J. 1967. Some methods for classification and analysis of multivariate observations. In: Le Cam, L.M, Neyman, & J. (eds) . Proceedings of the Fifth Berkeley Symposium on Mathematical Statistics and Probability, Volume 1. University of California Press, Berkeley281-297. ", + " MANCEY, S. J. & HOWARTH, R. J. 1980. Power-transform removal of skewness from large data sets. Institute for Mining and Metallurgy Transactions, Section B 89, 92-97. 
" + ], + "title": { + "textEnglish": "A comparison of unsupervised neural networks and k-means clustering in the analysis of multi-element stream sediment data" + } +} diff --git a/tests/stubdata/output/jats_indersci_url_ident.json b/tests/stubdata/output/jats_indersci_url_ident.json new file mode 100644 index 0000000..42bc0d1 --- /dev/null +++ b/tests/stubdata/output/jats_indersci_url_ident.json @@ -0,0 +1,139 @@ +{ + "abstract": { + "textEnglish": "Flexible sand control screens play a crucial role in the operation and economic benefits of ultra-short radius wells. We established a calculation model for the motion friction resistance of flexible screens and a criterion for contact between screen units and wellbore walls. Case analysis clarified the correlation between several key factors (e.g., maximum allowed swing angle, friction coefficient, insertion speed, and total length of screen units) and the frictional resistance of flexible screens. We utilised the overall distribution of contact point positions and the compressed distance between adjacent screen unit centroids to analyse the conversion mechanism of screen friction resistance from linear to nonlinear rapid accumulation. We also explored measures to improve the extension capacity of flexible screens. Our findings provide a basis for the optimisation design of flexible screen structures and the assessment of their maximum extension capacity. [Received: June 2, 2023; Accepted: February 13, 2024]" + }, + "authors": [ + { + "affiliation": [ + { + "affPubRaw": "School of Mechanical Engineering, Sichuan University of Science & Engineering, No.1 Baita Road, Yibin, Sichuan, 644002, China" + } + ], + "name": { + "given_name": "Zhongzhi", + "surname": "Hu" + } + }, + { + "affiliation": [ + { + "affPubRaw": "Production Engineering Research Institute, Daqing Oilfield, CNPC, No. 
9 Xibin Road, Daqing, Heilongjiang, 163712, China" + } + ], + "name": { + "given_name": "Junliang", + "surname": "Li" + } + }, + { + "affiliation": [ + { + "affPubRaw": "Production Engineering Research Institute, Daqing Oilfield, CNPC, No. 9 Xibin Road, Daqing, Heilongjiang, 163712, China" + } + ], + "name": { + "given_name": "Li", + "surname": "Wang" + } + }, + { + "affiliation": [ + { + "affPubRaw": "Dongying Ruifeng Petroleum Technical Development Co., Ltd., No. 5 Juzhou Road, Dongying, Shandong, 257299, China" + } + ], + "name": { + "given_name": "Jinbo", + "surname": "Wang" + } + }, + { + "affiliation": [ + { + "affPubRaw": "Bulk Material Transportation Equipment Business Unit, Chengdu Gongbei Intelligent Technology Co., Ltd., No. 7 Guanghua Street, Chengdu, Sichuan, 610011, China" + } + ], + "name": { + "given_name": "Xinyang", + "surname": "Liu" + } + }, + { + "affiliation": [ + { + "affPubRaw": "Dongying Ruifeng Petroleum Technical Development Co., Ltd., No. 5 Juzhou Road, Dongying, Shandong, 257299, China" + } + ], + "name": { + "given_name": "Yang", + "surname": "Kong" + } + } + ], + "copyright": { + "statement": "Copyright \u00a9 2024 Inderscience Enterprises Ltd.", + "status": true + }, + "keywords": [ + { + "keyString": "flexible screen", + "keySystem": "misc" + }, + { + "keyString": "sand control", + "keySystem": "misc" + }, + { + "keyString": "multi-body dynamics simulation", + "keySystem": "misc" + }, + { + "keyString": "frictional resistance", + "keySystem": "misc" + } + ], + "pagination": { + "firstPage": "55", + "lastPage": "76", + "pageCount": "21", + "pageRange": "55-76" + }, + "pubDate": { + "electrDate": "2024-07-03", + "printDate": "2024-00-00" + }, + "publication": { + "ISSN": [ + { + "issnString": "1753-3309", + "pubtype": "ppub" + }, + { + "issnString": "1753-3317", + "pubtype": "epub" + } + ], + "issueNum": "1", + "pubName": "International Journal of Oil, Gas and Coal Technology", + "pubYear": "2024", + "publisher": "Inderscience 
Publishers (IEL)", + "volumeNum": "36" + }, + "publisherIDs": [ + { + "Identifier": "https://www.inderscienceonline.com/doi/10.1504/IJOGCT.2024.139531", + "attribute": "url" + } + ], + "recordData": { + "createdTime": "", + "loadFormat": "JATS", + "loadLocation": "", + "loadType": "fromFile", + "parsedTime": "", + "recordOrigin": "" + }, + "title": { + "textEnglish": "Simulation study on frictional resistance and influencing factors of flexible screen pipe tripping into horizontal wellbore" + } +} diff --git a/tests/test_jats.py b/tests/test_jats.py index fde4c01..955f073 100644 --- a/tests/test_jats.py +++ b/tests/test_jats.py @@ -102,6 +102,8 @@ def test_jats(self): "jats_liebert_atypon", "jats_aip_native_strip", "jats_a+a_nested_collab", + "jats_indersci_url_ident", + "jats_gsl_unkeyed_xref", ] for f in filenames: