-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlanguage_data.m
110 lines (77 loc) · 3.79 KB
/
language_data.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
datadir = '/project/3011044.02/raw/stimuli/';
subtlex_table_filename = fullfile(datadir, 'worddata_subtlex.mat');
subtlex_firstrow_filename = fullfile(datadir, 'worddata_subtlex_firstrow.mat');
load(subtlex_table_filename);
load(subtlex_firstrow_filename);
audiodir = '/project/3011044.02/lab/pilot/stim/audio';
stories = streams_util_stories();
num_stories = numel(stories);
data = cell(1, 7);
data{1, 1} = 'story_ID';
data{1, 2} = 'word';
data{1, 3} = 'story_nr';
data{1, 4} = 'sentence_nr';
data{1, 5} = 'word_nr';
data{1, 6} = 'entropy';
data{1, 7} = 'perplexity';
%% story loop
sprintf('Creating data cell array via donders_combine_textgrid() for %d stories ...\n', num_stories)
for i = 1:num_stories
story = stories{i};
dondersfile = fullfile(audiodir, story, [story, '.donders']);
textgridfile = fullfile(audiodir, story, [story, '.TextGrid']);
% read in the model output
datatmp = combine_donders_textgrid(dondersfile, textgridfile);
num_words = numel(datatmp);
storytable = cell(num_words, 7);
storytable(:, 1) = {story}; % story label string
storytable(:, 2) = [datatmp(:).word]; % word string
storytable(:, 3) = {i}; % story index
storytable(:, 4) = {datatmp(:).sent_}; % sentence index
storytable(:, 5) = {datatmp(:).word_};
storytable(:, 6) = {datatmp(:).entropy};
storytable(:, 7) = {datatmp(:).perplexity};
% Append current story table rows to data table
data = vertcat(data, storytable);
end
% % log transform perplexity values
% perplexity_col = find(strcmp(data(1, :), 'perplexity'));
% data(2: end, 7) = cellfun(@log10, data(2:end, perplexity_col), 'UniformOutput', 0);
%
% data(1, perplexity_col) = {'lg10perp'};
data(1, 8) = {'lg10wf'}; % create the column for subtlex wordfrequencies
data(1, 9) = {'nchar'};
%% LOOK UP WORD FREQUENCIES AND NUMBER OF CHARACTERS IN SUBTLEX DATA
sprintf('Adding word frequencies and word length via add_subtlex() for %d stories and %d words in total...\n', num_stories, size(data, 1) - 1)
data = add_subtlex(data, subtlex_data, subtlex_firstrow);
%% SAVING
sprintf('saving \n %s \n %s \n %s \n', fullfile(datadir, 'language_data.mat'), fullfile(datadir, 'language_data.txt'), fullfile(datadir, 'language_data-noheader.txt'))
data_table = cell2table(data(2:end,:), 'VariableNames', data(1, :));
save(fullfile(datadir, 'language_data.mat'), 'data_table');
writetable(data_table, fullfile(datadir, 'language_data.txt'),'Delimiter', ',');
writetable(data_table, fullfile(datadir, 'language_data-noheader.txt'),'Delimiter', ',', 'WriteVariableNames', 0);
%% SUBFUNCTION
function [data] = add_subtlex(data, subtlex_data, subtlex_firstrow)
num_words = size(data, 1); % because the first row is header
data_word_column = find(strcmp(data(1,:), 'word')); % find where story words are
% check subtlex header for correct colums
subtlex_word_column = find(strcmp(subtlex_firstrow, 'spelling'));
subtlex_wlen_column = find(strcmp(subtlex_firstrow, 'nchar'));
subtlex_frequency_column = find(strcmp(subtlex_firstrow, 'Lg10WF'));
subtlex_words = subtlex_data(:, subtlex_word_column);
% WORD LOOP
for j = 2:num_words
word = data(j, data_word_column);
word = word{1};
% find the row index in subtlex data table
row = find(strcmp(subtlex_words, word));
if ~isempty(row)
% lookup the according frequency values
data{j, 8} = subtlex_data{row, subtlex_frequency_column};
data{j, 9} = subtlex_data{row, subtlex_wlen_column};
else % if it is a punctuation mark or a proper name etc., write nan (punctuation marks etc. do not appear in the subtlex table)
data{j, 8} = nan;
data{j, 9} = nan;
end
end
end