Difference between revisions of "Module:Citation/CS1/Identifiers"
Module:Citation/CS1/Identifiers (view source)
Revision as of 16:42, 25 November 2023
, 16:42, 25 November 2023sync from sandbox;
m (1 revision imported: Content from Template:Reflist on Wikipedia) |
(sync from sandbox;) |
||
Line 164: | Line 164: | ||
--[=[-------------------------< I S _ V A L I D _ | --[=[-------------------------< I S _ V A L I D _ R X I V _ D A T E >------------------------------------------ | ||
returns true if: | for biorxiv, returns true if: | ||
2019-12-11T00:00Z <= biorxiv_date < today + 2 days | 2019-12-11T00:00Z <= biorxiv_date < today + 2 days | ||
for medrxiv, returns true if: | |||
2020-01-01T00:00Z <= medrxiv_date < today + 2 days | |||
The dated form of biorxiv identifier has a start date of 2019-12-11. The Unix timestamp for that date is {{#time:U|2019-12-11}} = 1576022400 | The dated form of biorxiv identifier has a start date of 2019-12-11. The Unix timestamp for that date is {{#time:U|2019-12-11}} = 1576022400 | ||
The medrxiv identifier has a start date of 2020-01-01. The Unix timestamp for that date is {{#time:U|2020-01-01}} = 1577836800 | |||
<rxiv_date> is the date provided in those |biorxiv= parameter values that are dated and in |medrxiv= parameter values at time 00:00:00 UTC | |||
today is the current date at time 00:00:00 UTC plus 48 hours | <today> is the current date at time 00:00:00 UTC plus 48 hours | ||
if today is | if today's date is 2023-01-01T00:00:00 then | ||
adding 24 hours gives | adding 24 hours gives 2023-01-02T00:00:00 – one second more than today | ||
adding 24 hours gives | adding another 24 hours gives 2023-01-03T00:00:00 – one second more than tomorrow | ||
inputs: | |||
<y>, <m>, <d> – year, month, day parts of the date from the biorxiv or medrxiv identifier | |||
<select> 'b' for biorxiv, 'm' for medrxiv; defaults to 'b' | |||
]=] | ]=] | ||
local function | local function is_valid_rxiv_date (y, m, d, select) | ||
local | if 0 == tonumber (m) and 12 < tonumber (m) then -- <m> must be a number 1–12 | ||
return false; | |||
end | |||
if 0 == tonumber (d) and 31 < tonumber (d) then -- <d> must be a number 1–31; TODO: account for month length and leap yer? | |||
return false; | |||
end | |||
local rxiv_date = table.concat ({y, m, d}, '-'); -- make ymd date string | |||
local good1, good2; | local good1, good2; | ||
local | local rxiv_ts, tomorrow_ts; -- to hold Unix timestamps representing the dates | ||
local lang_object = mw.getContentLanguage(); | local lang_object = mw.getContentLanguage(); | ||
good1, | good1, rxiv_ts = pcall (lang_object.formatDate, lang_object, 'U', rxiv_date); -- convert rxiv_date value to Unix timestamp | ||
good2, tomorrow_ts = pcall (lang_object.formatDate, lang_object, 'U', 'today + 2 days' ); -- today midnight + 2 days is one second more than all day tomorrow | good2, tomorrow_ts = pcall (lang_object.formatDate, lang_object, 'U', 'today + 2 days' ); -- today midnight + 2 days is one second more than all day tomorrow | ||
if good1 and good2 then -- lang.formatDate() returns a timestamp in the local script which tonumber() may not understand | if good1 and good2 then -- lang.formatDate() returns a timestamp in the local script which tonumber() may not understand | ||
rxiv_ts = tonumber (rxiv_ts) or lang_object:parseFormattedNumber (rxiv_ts); -- convert to numbers for the comparison; | |||
tomorrow_ts = tonumber (tomorrow_ts) or lang_object:parseFormattedNumber (tomorrow_ts); | tomorrow_ts = tonumber (tomorrow_ts) or lang_object:parseFormattedNumber (tomorrow_ts); | ||
else | else | ||
Line 200: | Line 209: | ||
end | end | ||
return (( | local limit_ts = ((select and ('m' == select)) and 1577836800) or 1576022400; -- choose the appropriate limit timesatmp | ||
return ((limit_ts <= rxiv_ts) and (rxiv_ts < tomorrow_ts)) -- limit_ts <= rxiv_date < tomorrow's date | |||
end | end | ||
Line 250: | Line 261: | ||
--[[--------------------------< N O R M A L I Z E _ L C C N >-------------------------------------------------- | --[[--------------------------< N O R M A L I Z E _ L C C N >-------------------------------------------------- | ||
LCCN normalization ( | LCCN normalization (https://www.loc.gov/marc/lccn-namespace.html#normalization) | ||
1. Remove all blanks. | 1. Remove all blanks. | ||
2. If there is a forward slash (/) in the string, remove it, and remove all characters to the right of the forward slash. | 2. If there is a forward slash (/) in the string, remove it, and remove all characters to the right of the forward slash. | ||
Line 287: | Line 298: | ||
--[[--------------------------< A R X I V >-------------------------------------------------------------------- | --[[--------------------------< A R X I V >-------------------------------------------------------------------- | ||
See: | See: https://arxiv.org/help/arxiv_identifier | ||
format and error check arXiv identifier. There are three valid forms of the identifier: | format and error check arXiv identifier. There are three valid forms of the identifier: | ||
Line 367: | Line 378: | ||
if is_set (class) then | if is_set (class) then | ||
if id:match ('^%d+') then | if id:match ('^%d+') then | ||
text = table.concat ({text, ' [[//arxiv.org/archive/', class, ' ', class, ']]'}); -- external link within square brackets, not wikilink | text = table.concat ({text, ' [[https://arxiv.org/archive/', class, ' ', class, ']]'}); -- external link within square brackets, not wikilink | ||
else | else | ||
set_message ('err_class_ignored'); | set_message ('err_class_ignored'); | ||
Line 381: | Line 392: | ||
Validates (sort of) and formats a bibcode ID. | Validates (sort of) and formats a bibcode ID. | ||
Format for bibcodes is specified here: | Format for bibcodes is specified here: https://adsabs.harvard.edu/abs_doc/help_pages/data.html#bibcodes | ||
But, this: 2015arXiv151206696F is apparently valid so apparently, the only things that really matter are length, 19 characters | But, this: 2015arXiv151206696F is apparently valid so apparently, the only things that really matter are length, 19 characters | ||
Line 399: | Line 410: | ||
local access = options.access; | local access = options.access; | ||
local handler = options.handler; | local handler = options.handler; | ||
local ignore_invalid = options.accept; | |||
local err_type; | local err_type; | ||
local err_msg = ''; | local err_msg = ''; | ||
Line 421: | Line 433: | ||
if id:find('&%.') then | if id:find('&%.') then | ||
err_type = cfg.err_msg_supl.journal; -- journal abbreviation must not have '&.' (if it does it's missing a letter) | err_type = cfg.err_msg_supl.journal; -- journal abbreviation must not have '&.' (if it does it's missing a letter) | ||
end | |||
if id:match ('.........%.tmp%.') then -- temporary bibcodes when positions 10–14 are '.tmp.' | |||
set_message ('maint_bibcode'); | |||
end | end | ||
end | end | ||
end | end | ||
if is_set (err_type) then | if is_set (err_type) and not ignore_invalid then -- if there was an error detected and accept-as-written markup not used | ||
set_message ('err_bad_bibcode', {err_type}); | set_message ('err_bad_bibcode', {err_type}); | ||
options.coins_list_t['BIBCODE'] = nil; -- when error, unset so not included in COinS | options.coins_list_t['BIBCODE'] = nil; -- when error, unset so not included in COinS | ||
end | end | ||
Line 456: | Line 470: | ||
local patterns = { | local patterns = { | ||
'^10.1101/%d%d%d%d%d%d$', -- simple 6-digit identifier (before 2019-12-11) | '^10%.1101/%d%d%d%d%d%d$', -- simple 6-digit identifier (before 2019-12-11) | ||
'^10.1101/(20 | '^10%.1101/(20%d%d)%.(%d%d)%.(%d%d)%.%d%d%d%d%d%dv%d+$', -- y.m.d. date + 6-digit identifier + version (after 2019-12-11) | ||
'^10.1101/(20 | '^10%.1101/(20%d%d)%.(%d%d)%.(%d%d)%.%d%d%d%d%d%d$', -- y.m.d. date + 6-digit identifier (after 2019-12-11) | ||
} | } | ||
Line 466: | Line 480: | ||
if m then -- m is nil when id is the six-digit form | if m then -- m is nil when id is the six-digit form | ||
if not | if not is_valid_rxiv_date (y, m, d, 'b') then -- validate the encoded date; 'b' for biorxiv limit | ||
break; -- date fail; break out early so we don't unset the error message | break; -- date fail; break out early so we don't unset the error message | ||
end | end | ||
Line 527: | Line 541: | ||
and terminal punctuation may not be technically correct but it appears, that in practice these characters are rarely | and terminal punctuation may not be technically correct but it appears, that in practice these characters are rarely | ||
if ever used in DOI names. | if ever used in DOI names. | ||
https://www.doi.org/doi_handbook/2_Numbering.html -- 2.2 Syntax of a DOI name | |||
https://www.doi.org/doi_handbook/2_Numbering.html#2.2.2 -- 2.2.2 DOI prefix | |||
]] | ]] | ||
Line 540: | Line 557: | ||
local text; | local text; | ||
if is_set (inactive) then | if is_set (inactive) then | ||
local inactive_year = inactive:match("%d%d%d%d") | local inactive_year = inactive:match("%d%d%d%d"); -- try to get the year portion from the inactive date | ||
local inactive_month, good; | local inactive_month, good; | ||
Line 551: | Line 568: | ||
end | end | ||
end | end | ||
end -- otherwise, |doi-broken-date= has something but it isn't a date | |||
if is_set (inactive_year) and is_set (inactive_month) then | if is_set (inactive_year) and is_set (inactive_month) then | ||
Line 568: | Line 583: | ||
local registrant_err_patterns = { -- these patterns are for code ranges that are not supported | local registrant_err_patterns = { -- these patterns are for code ranges that are not supported | ||
'^[^1-3]%d%d%d%d%.%d | '^[^1-3]%d%d%d%d%.%d+$', -- 5 digits with subcode (0xxxx, 40000+); accepts: 10000–39999 | ||
'^[^1- | '^[^1-6]%d%d%d%d$', -- 5 digits without subcode (0xxxx, 60000+); accepts: 10000–69999 | ||
'^[^1-9]%d%d%d%.%d | '^[^1-9]%d%d%d%.%d+$', -- 4 digits with subcode (0xxx); accepts: 1000–9999 | ||
'^[^1-9]%d%d%d$', -- 4 digits without subcode (0xxx); accepts: 1000–9999 | '^[^1-9]%d%d%d$', -- 4 digits without subcode (0xxx); accepts: 1000–9999 | ||
'^%d%d%d%d%d%d+', -- 6 or more digits | '^%d%d%d%d%d%d+', -- 6 or more digits | ||
'^%d%d?%d?$', -- less than 4 digits without subcode (with subcode is legitimate) | '^%d%d?%d?$', -- less than 4 digits without subcode (3 digits with subcode is legitimate) | ||
'^%d%d?%.[%d%.]+', -- 1 or 2 digits with subcode | |||
'^5555$', -- test registrant will never resolve | '^5555$', -- test registrant will never resolve | ||
'[^%d%.]', -- any character that isn't a digit or a dot | '[^%d%.]', -- any character that isn't a digit or a dot | ||
Line 595: | Line 611: | ||
if err_flag then | if err_flag then | ||
options.coins_list_t['DOI'] = nil; -- when error, unset so not included in COinS | options.coins_list_t['DOI'] = nil; -- when error, unset so not included in COinS | ||
else | |||
if not access and cfg.known_free_doi_registrants_t[registrant] then -- |doi-access=free not set and <registrant> is known to be free | |||
set_message ('maint_doi_unflagged_free'); -- set a maint cat | |||
end | |||
end | end | ||
Line 621: | Line 641: | ||
if ever used in HDLs. | if ever used in HDLs. | ||
Query string parameters are named here: | Query string parameters are named here: https://www.handle.net/proxy_servlet.html. query strings are not displayed | ||
but since '?' is an allowed character in an HDL, '?' followed by one of the query parameters is the only way we | but since '?' is an allowed character in an HDL, '?' followed by one of the query parameters is the only way we | ||
have to detect the query string so that it isn't URL-encoded with the rest of the identifier. | have to detect the query string so that it isn't URL-encoded with the rest of the identifier. | ||
Line 631: | Line 651: | ||
local access = options.access; | local access = options.access; | ||
local handler = options.handler; | local handler = options.handler; | ||
local query_params = { -- list of known query parameters from | local query_params = { -- list of known query parameters from https://www.handle.net/proxy_servlet.html | ||
'noredirect', | 'noredirect', | ||
'ignore_aliases', | 'ignore_aliases', | ||
Line 800: | Line 820: | ||
Determines whether an ISMN string is valid. Similar to ISBN-13, ISMN is 13 digits beginning 979-0-... and uses the | Determines whether an ISMN string is valid. Similar to ISBN-13, ISMN is 13 digits beginning 979-0-... and uses the | ||
same check digit calculations. See | same check digit calculations. See https://www.ismn-international.org/download/Web_ISMN_Users_Manual_2008-6.pdf | ||
section 2, pages 9–12. | section 2, pages 9–12. | ||
Line 849: | Line 869: | ||
like this: | like this: | ||
|issn=0819 4327 gives: [ | |issn=0819 4327 gives: [https://www.worldcat.org/issn/0819 4327 0819 4327] -- can't have spaces in an external link | ||
This code now prevents that by inserting a hyphen at the ISSN midpoint. It also validates the ISSN for length | This code now prevents that by inserting a hyphen at the ISSN midpoint. It also validates the ISSN for length | ||
Line 953: | Line 973: | ||
Format LCCN link and do simple error checking. LCCN is a character string 8-12 characters long. The length of | Format LCCN link and do simple error checking. LCCN is a character string 8-12 characters long. The length of | ||
the LCCN dictates the character type of the first 1-3 characters; the rightmost eight are always digits. | the LCCN dictates the character type of the first 1-3 characters; the rightmost eight are always digits. | ||
https://oclc-research.github.io/infoURI-Frozen/info-uri.info/info:lccn/reg.html | |||
length = 8 then all digits | length = 8 then all digits | ||
Line 1,008: | Line 1,028: | ||
return external_link_id ({link = handler.link, label = handler.label, q = handler.q, redirect = handler.redirect, | return external_link_id ({link = handler.link, label = handler.label, q = handler.q, redirect = handler.redirect, | ||
prefix = handler.prefix, id = lccn, separator = handler.separator, encode = handler.encode}); | prefix = handler.prefix, id = lccn, separator = handler.separator, encode = handler.encode}); | ||
end | |||
--[[--------------------------< M E D R X I V >----------------------------------------------------------------- | |||
Format medRxiv ID and do simple error checking. Similar to later bioRxiv IDs, medRxiv IDs are prefixed with a | |||
yyyy.mm.dd. date and suffixed with an optional version identifier. Earliest date accepted is 2020.01.01. | |||
The medRxiv ID is a date followed by an eight-digit number followed by an optional version indicator 'v' and one or more digits: | |||
https://www.medrxiv.org/content/10.1101/2020.11.16.20232009v2 -> 10.1101/2020.11.16.20232009v2 | |||
]] | |||
-- Validate and format a medRxiv identifier.
--
-- <options> fields read here:
--	id – the identifier as written in |medrxiv=
--	handler – identifier configuration (link, label, q, redirect, prefix, separator, encode, access)
--	coins_list_t – COinS metadata table; the 'MEDRXIV' entry is unset when <id> is invalid
--
-- Accepted forms, all with the 10.1101/ DOI prefix:
--	10.1101/nnnnnnnn					8-digit identifier, no date
--	10.1101/yyyy.mm.dd.nnnnnnnn			dated identifier
--	10.1101/yyyy.mm.dd.nnnnnnnnvN		dated identifier with version
-- Dated forms must also pass is_valid_rxiv_date() using the medrxiv ('m') limit.
--
-- Returns the result of external_link_id(); when <id> is invalid, err_bad_medrxiv is set.
local function medrxiv (options)
	local id = options.id;
	local handler = options.handler;
	local err_msg_flag = true;													-- flag; assume that there will be an error

	local patterns = {
		'^10%.1101/%d%d%d%d%d%d%d%d$',											-- simple 8-digit identifier; anchored so that it cannot match the tail of a dated or malformed id
		'^10%.1101/(20%d%d)%.(%d%d)%.(%d%d)%.%d%d%d%d%d%d%d%dv%d+$',			-- y.m.d. date + 8-digit identifier + version (2020-01-01 and later)
		'^10%.1101/(20%d%d)%.(%d%d)%.(%d%d)%.%d%d%d%d%d%d%d%d$',				-- y.m.d. date + 8-digit identifier (2020-01-01 and later)
		}

	for _, pattern in ipairs (patterns) do										-- spin through the patterns looking for a match
		if id:match (pattern) then
			local y, m, d = id:match (pattern);									-- found a match, attempt to get year, month and date from the identifier

			if m then															-- m is nil when id is the 8-digit form (that pattern has no captures)
				if not is_valid_rxiv_date (y, m, d, 'm') then					-- validate the encoded date; 'm' selects the medrxiv limit
					break;														-- date fail; break out early so we don't unset the error message
				end
			end
			err_msg_flag = nil;													-- we found a match so unset the error message
			break;																-- and done
		end
	end																			-- <err_msg_flag> remains set here when no match

	if err_msg_flag then
		options.coins_list_t['MEDRXIV'] = nil;									-- when error, unset so not included in COinS
		set_message ('err_bad_medrxiv');										-- and set the error message
	end

	return external_link_id ({link = handler.link, label = handler.label, q = handler.q, redirect = handler.redirect,
			prefix = handler.prefix, id = id, separator = handler.separator,
			encode = handler.encode, access = handler.access});
end
Line 1,069: | Line 1,135: | ||
elseif id:match('^%d+$') then -- no prefix | elseif id:match('^%d+$') then -- no prefix | ||
number = id; -- get the number | number = id; -- get the number | ||
if | if tonumber (id) > handler.id_limit then | ||
number = nil; -- | number = nil; -- unset when id value exceeds the limit | ||
end | end | ||
end | end | ||
Line 1,531: | Line 1,597: | ||
['JSTOR'] = jstor, | ['JSTOR'] = jstor, | ||
['LCCN'] = lccn, | ['LCCN'] = lccn, | ||
['MEDRXIV'] = medrxiv, | |||
['MR'] = mr, | ['MR'] = mr, | ||
['OCLC'] = oclc, | ['OCLC'] = oclc, |