Skip to content

Commit

Permalink
Fix f2xx field generators (#184)
Browse files Browse the repository at this point in the history
* f264 $c generation validates value and uses only year instead of full date value.
* f264 $a may also be generated from dc.publisher.x-cityofpublication
* Disallow f246 duplication: only the first entry of dc.title.alternative generates the field
* f264 $b now picks publisher name that aligns with title language if one is available and title language is defined
* Add f245 title splitting into title + subtitle when the title matches one of the most common separator patterns
* Enhance language detection for items and unify usage of detected language value between f008/f264
  • Loading branch information
aatuny authored Nov 5, 2024
1 parent f16c3f3 commit 97627da
Show file tree
Hide file tree
Showing 43 changed files with 463 additions and 62 deletions.
131 changes: 107 additions & 24 deletions src/transform/convert/common/generate2xx.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,24 @@
* @param {Object} ValueInterface containing getFields and getFieldValues functions
* @returns Empty array or array containing field 245 ($a)
*/
export function generate245({getFields, getFieldValues}) {
export function generate245({getFields}) {
const isAddedEntry = generateIsAddedEntry();
const [title] = getFieldValues('dc.title');
const ind1 = isAddedEntry ? '1' : '0';
const ind2 = ' '; // NB: generated in validation phase by marc-record-validators-melinda:IndicatorFixes

const fields = getFields('dc.title');
if (fields.length === 0) {
return [];
}

const titleText = fields.length > 0 ? fields[0].$.value : null;

const {title, alternativeSubtitle} = getTitle(titleText);

return alternativeSubtitle
? [{tag: '245', ind1, ind2, subfields: [{code: 'a', value: `${title} :`}, {code: 'b', value: `${alternativeSubtitle}.`}]}]
: [{tag: '245', ind1, ind2, subfields: [{code: 'a', value: `${title}.`}]}];

return title ? [
{
tag: '245',
ind1: isAddedEntry ? '1' : '0',
ind2: '0',
subfields: [{code: 'a', value: `${title}.`}]
}
] : [];

function generateIsAddedEntry() {
const fields = getFields(p => [
Expand All @@ -24,6 +30,42 @@ export function generate245({getFields, getFieldValues}) {

return fields.length > 0;
}

/**
 * Splits title to title+subtitle if title contains any of the patterns that require this type of processing.
 * Note: this getter is same as one defined within ONIX-transformer
 * @param {string} titleText Title text to process
 * @returns {Object} Object with `title` (trailing whitespace removed) and `alternativeSubtitle`
 * (trimmed text after the separator, or undefined when no pattern matched). The subtitle is an
 * empty string when the matched separator is the last thing in the title.
 */
function getTitle(titleText) {
  const regexObj = findRegex(titleText);
  const result = regexObj ? regexObj.regex.exec(titleText) : null;

  if (!result) {
    return {title: titleText.trimEnd(), alternativeSubtitle: undefined};
  }

  // Use the matched text explicitly instead of coercing the whole exec() result array to a string:
  // the coercion would inject commas the moment any pattern gains a capture group.
  const [matchedText] = result;
  const {keepCharactersFromStart, keepCharactersFromEnd, keepResult = false} = regexObj;

  // The keep* offsets hand characters consumed by the pattern (e.g. the letters around ' - ')
  // back to the main title and to the subtitle respectively.
  const mainTitleEnd = result.index + keepCharactersFromStart;
  const subtitleStart = result.index + matchedText.length - keepCharactersFromEnd;

  const mainTitle = titleText.slice(0, mainTitleEnd);

  return {
    title: (keepResult ? `${mainTitle}${matchedText}` : mainTitle).trimEnd(),
    alternativeSubtitle: titleText.slice(subtitleStart).trim()
  };

  /**
   * Finds the first pattern definition whose regex matches the title.
   * @param {string} titleText Title text to test
   * @returns {Object|undefined} Matching pattern definition or undefined
   */
  function findRegex(titleText) {
    // Note: order defines priority
    const pluralOfRegex = [
      // split title to mainTitle and subtitle at first ':' followed by whitespace, do not keep ':'
      {keepCharactersFromStart: 0, keepCharactersFromEnd: 0, regex: /:\s+/u},
      // split title to mainTitle and subtitle at first ' - ' (hyphen, en dash or em dash) that is not
      // surrounded by digits, do not keep the separator; the pattern also consumes one character on
      // each side, which the keep* offsets return to the title and the subtitle
      {keepCharactersFromStart: 1, keepCharactersFromEnd: 1, regex: /[^0-9]\s+[\u2013\u2014-]\s+[^0-9]/u},
      // split title to mainTitle and subtitle at first run of '!' or '?', keep question and exclamation marks, they are part of the title
      {keepCharactersFromStart: 0, keepCharactersFromEnd: 0, keepResult: true, regex: /!+|\?+/u}
    ];

    return pluralOfRegex.find(({regex}) => regex.test(titleText));
  }
}
}

/**
Expand All @@ -33,10 +75,12 @@ export function generate245({getFields, getFieldValues}) {
*/
export function generate246({getFieldValues}) {
  const values = getFieldValues('dc.title.alternative');

  if (values.length === 0) {
    return [];
  }

  // Field 246 must not be duplicated: only the first dc.title.alternative value
  // generates a field, any further values are ignored.
  const [firstAlternativeTitle] = values;

  return [
    {
      tag: '246', ind1: '1', ind2: '3',
      subfields: [{code: 'a', value: firstAlternativeTitle}]
    }
  ];
}

/**
Expand All @@ -58,8 +102,9 @@ export function generate250({getFieldValues}) {
* @param {Object} ValueInterface containing getFieldValues function
* @returns Empty array or array containing field 264 ($a, $b, $c)
*/
export function generate264({getFieldValues}) {
const subfields = generateSubfields();

export function generate264({getFields, getFieldValues}, titleLanguage) {
const subfields = generateSubfields(titleLanguage);

if (subfields.length > 0) {
return [
Expand All @@ -72,28 +117,66 @@ export function generate264({getFieldValues}) {
return [];


/**
 * Generates subfields $a, $b and $c for field 264 (in that order), each one only if a source value exists.
 * Reads values through getFields/getFieldValues of the enclosing generate264 scope.
 * @param {string|null} titleLanguage Language used for choosing between publisher name language versions
 * @returns {Array} Array of subfield objects, empty if nothing could be generated
 */
function generateSubfields(titleLanguage) {
  // Generated in reverse order: each subfield's trailing separator depends on whether a later subfield exists
  const subfieldC = generateSubfieldC();
  const subfieldB = generateSubfieldB(subfieldC.length > 0, titleLanguage);
  // Fixed: previously read `subfieldC.lenth` (undefined), so presence of $c was never taken into account for $a
  const subfieldA = generateSubfieldA(subfieldB.length > 0, subfieldC.length > 0);

  return subfieldA.concat(subfieldB, subfieldC);

  /**
   * Generates $a from first dc.publisher.place value, falling back to dc.publisher.x-cityofpublication.
   */
  function generateSubfieldA(hasSubfieldB, hasSubfieldC) {
    // NOTE(review): ':' is used also when $a is followed directly by $c; confirm this is the intended punctuation
    const fieldSeparator = hasSubfieldB || hasSubfieldC ? ':' : '';

    const dcPublisherPlaceValues = getFieldValues('dc.publisher.place');
    const dcPublisherCityOfPublicationValues = getFieldValues('dc.publisher.x-cityofpublication');

    const values = dcPublisherPlaceValues.length > 0 ? dcPublisherPlaceValues : dcPublisherCityOfPublicationValues;
    return values.length > 0 ? [{code: 'a', value: `${values[0]}${fieldSeparator}`}] : [];
  }

  /**
   * Generates $b from dc.publisher. Picks the entry whose language attribute equals titleLanguage
   * if title language is defined and such entry exists, otherwise the first entry.
   */
  function generateSubfieldB(hasSubfieldC, titleLanguage) {
    const fieldSeparator = hasSubfieldC ? ',' : '';

    const fields = getFields('dc.publisher');

    if (fields.length === 0) {
      return [];
    }

    // NOTE(review): assumes dc.publisher language attributes use the same code format as titleLanguage; verify against input data
    const languageVersionValue = titleLanguage ? fields.find(f => f.$.language === titleLanguage) : false;
    const fieldValue = languageVersionValue ? languageVersionValue.$.value : fields[0].$.value;

    return [{code: 'b', value: `${fieldValue}${fieldSeparator}`}];
  }

  /**
   * Generates f264 $c from first encountered dc.date.issued that is in format of one of following:
   * - YYYY
   * - YYYY-MM
   * - YYYY-MM-DD
   * Only the year part is used; values in any other format are skipped.
   */
  function generateSubfieldC() {
    const dcValues = getFieldValues('dc.date.issued');
    const validValues = dcValues.map(getYear).filter(v => v !== null);

    return validValues.length > 0 ? [{code: 'c', value: `${validValues[0]}.`}] : [];

    // Returns the four-digit year of a valid date value, null for anything else
    function getYear(v) {
      const validFormats = [
        /^\d{4}-\d{2}-\d{2}$/u,
        /^\d{4}-\d{2}$/u,
        /^\d{4}$/u
      ];

      const valueIsValid = validFormats.some(re => re.test(v));
      if (!valueIsValid) {
        return null;
      }

      return v.slice(0, 4);
    }
  }
}
}
4 changes: 2 additions & 2 deletions src/transform/convert/common/generate2xx.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -94,13 +94,13 @@ function generate264() {
}
});

function callback({getFixture}) {
function callback({getFixture, titleLanguage = null}) {
const input = getFixture('input.json');
const output = getFixture('output.json');

const valueInterface = createValueInterface(input);

const result = fieldGenerator.generate264(valueInterface);
const result = fieldGenerator.generate264(valueInterface, titleLanguage);
expect(result).to.eql(output);
}
}
14 changes: 4 additions & 10 deletions src/transform/convert/common/generateControlFields.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import {formatLanguage} from '../util';

/**
* Generates static control field LDR
* @returns Field LDR as string
Expand Down Expand Up @@ -27,20 +25,21 @@ export function generate007() {
* - dc.type.ontasot
* - dc.language.iso
* @param {Object} ValueInterface containing getFieldValues and getFields functions
* @param {string|null} language language of item
* @param {Object} moment Moment instance to be used for date generation
* @returns Field 008 as string
*/
export function generate008({getFields, getFieldValues}, moment) {
export function generate008({getFields, getFieldValues}, language, moment) {
const timestamp = generateTimestamp();
const date = generateDate();
const country = generateCountry();
const contentNature = generateNatureOfContent();
const language = generateLanguage();
const lng = language || 'und';

return [
{
tag: '008',
value: `${timestamp}s${date} ${country} |||||o${contentNature}|||| ||${language} c`
value: `${timestamp}s${date} ${country} |||||o${contentNature}|||| ||${lng} c`
}
];

Expand All @@ -62,9 +61,4 @@ export function generate008({getFields, getFieldValues}, moment) {
const levels = getFields('dc.type.ontasot');
return levels.length > 0 ? 'm ' : '||||';
}

function generateLanguage() {
const values = getFieldValues('dc.language.iso');
return formatLanguage(values.slice(-1)[0]);
}
}
4 changes: 2 additions & 2 deletions src/transform/convert/common/generateControlFields.spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,14 @@ function generate008() {
}
});

function callback({getFixture}) {
function callback({getFixture, language = null}) {
const input = getFixture('input.json');
const output = getFixture('output.json');

const valueInterface = createValueInterface(input);
const momentMock = () => moment('2020-01-01T00:00:00');

const result = fieldGenerator.generate008(valueInterface, momentMock);
const result = fieldGenerator.generate008(valueInterface, language, momentMock);
expect(result).to.eql(output);
}
}
6 changes: 4 additions & 2 deletions src/transform/convert/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import {generate648, generate650, generate651, generate653} from './common/gener
import {generate776} from './common/generate7xx';
import {generate856, generate884} from './common/generate8xx';
import {generateSID, generateLOW} from './common/generateSystemFields';
import {getLanguage} from './util';

/**
* Generates MarcRecord using common field generations.
Expand All @@ -23,13 +24,14 @@ import {generateSID, generateLOW} from './common/generateSystemFields';
*/
export default ({harvestSource, fieldValueInterface, convertOpts = {}}) => {
const momentSource = convertOpts.moment || moment;
const titleLanguage = getLanguage(fieldValueInterface);

const marcRecord = new MarcRecord();
marcRecord.leader = generateLDR(); // eslint-disable-line functional/immutable-data

const fields = [
generate007(),
generate008(fieldValueInterface, momentSource),
generate008(fieldValueInterface, titleLanguage, momentSource),
generate020(fieldValueInterface),
generate024(fieldValueInterface),
generate040(),
Expand All @@ -39,7 +41,7 @@ export default ({harvestSource, fieldValueInterface, convertOpts = {}}) => {
generate245(fieldValueInterface),
generate246(fieldValueInterface),
generate250(fieldValueInterface),
generate264(fieldValueInterface),
generate264(fieldValueInterface, titleLanguage),
generate300(fieldValueInterface),
generate336(),
generate337(),
Expand Down
60 changes: 58 additions & 2 deletions src/transform/convert/util/index.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import langs from 'langs';
import LanguageDetect from 'languagedetect';

/**
* Creates interface that allows interacting with metadata given as input
Expand Down Expand Up @@ -40,15 +41,15 @@ export function createValueInterface(inputFields) {
/**
 * Formats language based on given language code
 * @param {string} code Language code to format
 * @returns {string|null} Lang code as it was if seems valid (three characters long), otherwise
 * the 2B lang code found using langs.where(1, code), or null if not found
 */
export function formatLanguage(code) {
  if (code && code.length === 3) {
    return code;
  }

  const lang = langs.where(1, code);
  // Unknown codes yield null instead of 'und': callers decide the fallback themselves
  // (generate008 substitutes 'und', language getters treat null as "not found").
  return lang ? lang['2B'] : null;
}

/**
Expand Down Expand Up @@ -259,3 +260,58 @@ export function parseIssnFromString(issnString) {
const [result] = issnString.match(issnRegex);
return result;
}

/**
 * Resolves language of item by trying the sources in priority order:
 * 1. first dc.language.iso value (formatted with formatLanguage; no further source is tried even if formatting yields null)
 * 2. language attribute of dc.title (getTitleLanguage)
 * 3. language detection from title text (detectLanguage)
 * @param {Object} ValueInterface containing getFields and getFieldValues functions
 * @returns Result of the first source that applies, null if none of them produces a language
 */
export function getLanguage({getFields, getFieldValues}) {
  const isoCodes = getFieldValues('dc.language.iso');
  const hasIsoCode = isoCodes.length > 0;

  if (hasIsoCode) {
    const [primaryIsoCode] = isoCodes;
    return formatLanguage(primaryIsoCode);
  }

  // Detection is attempted only when the title carries no usable language attribute
  return getTitleLanguage({getFields}) || detectLanguage({getFieldValues});
}

/**
 * Reads language attribute of the first dc.title field.
 * Only attribute values en/sv/fi are accepted; accepted value is passed through formatLanguage.
 * @param {Object} ValueInterface containing getFields function
 * @returns Formatted language, or null if there is no title field or the attribute is missing/not an accepted value
 */
export function getTitleLanguage({getFields}) {
  const acceptedAttributeValues = ['en', 'sv', 'fi'];
  const titleFields = getFields('dc.title');

  if (titleFields.length === 0) {
    return null;
  }

  const {language} = titleFields[0].$;

  if (!acceptedAttributeValues.includes(language)) {
    return null;
  }

  return formatLanguage(language);
}

/**
 * Detect title language using language detection. Returns only fin/eng/swe or null.
 * Only the first dc.title value and only the best detection result are considered.
 * @param {Object} ValueInterface containing getFieldValues function
 * @returns Detected language passed through formatLanguage, or null if there is no title,
 * nothing was detected or the detected language is not one of fin/eng/swe
 */
export function detectLanguage({getFieldValues}) {
  const validLangs = ['eng', 'swe', 'fin'];

  const titleFields = getFieldValues('dc.title');
  if (titleFields.length === 0) {
    return null;
  }

  const lngDetector = new LanguageDetect();
  lngDetector.setLanguageType('iso3');

  const [title] = titleFields;

  // detect() yields [language, score] pairs, best match first. Pick the language of the first pair
  // directly: flattening the pairs before destructuring made the code fail on the numeric score.
  const [bestMatch] = lngDetector.detect(title, 1);
  if (!bestMatch) {
    return null;
  }

  const [lang] = bestMatch;
  return validLangs.includes(lang) ? formatLanguage(lang) : null;
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"$": {
"schema": "dc",
"element": "title",
"value": "Dublin Core - MARC21 -konversio"
"value": "Dublin Core"
}
}
]
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[
{
"tag": "245", "ind1": "0", "ind2": "0",
"tag": "245", "ind1": "0", "ind2": " ",
"subfields": [
{"code": "a", "value": "Dublin Core - MARC21 -konversio."}
{"code": "a", "value": "Dublin Core."}
]
}
]
Loading

0 comments on commit 97627da

Please sign in to comment.