-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathParser.cs
238 lines (198 loc) · 10.1 KB
/
Parser.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Xml;
using System.Xml.Schema;
using DocumentFormat.OpenXml.Packaging;
using Microsoft.Extensions.Logging;
using UK.Gov.Legislation.Judgments;
using UK.Gov.NationalArchives.CaseLaw.Parse;
using AkN = UK.Gov.Legislation.Judgments.AkomaNtoso;
using PS = UK.Gov.NationalArchives.CaseLaw.PressSummaries;
using AttachmentPair = System.Tuple<byte[], UK.Gov.Legislation.Judgments.AttachmentType>;
using ParseFunction = System.Func<byte[], UK.Gov.Legislation.Judgments.IOutsideMetadata, System.Collections.Generic.IEnumerable<System.Tuple<byte[], UK.Gov.Legislation.Judgments.AttachmentType>>, UK.Gov.Legislation.Judgments.AkomaNtoso.ILazyBundle>;
using AttachmentPair1 = System.Tuple<DocumentFormat.OpenXml.Packaging.WordprocessingDocument, UK.Gov.Legislation.Judgments.AttachmentType>;
using OptimizedParseFunction = System.Func<DocumentFormat.OpenXml.Packaging.WordprocessingDocument, UK.Gov.NationalArchives.CaseLaw.Parse.WordDocument, UK.Gov.Legislation.Judgments.IOutsideMetadata, System.Collections.Generic.IEnumerable<System.Tuple<DocumentFormat.OpenXml.Packaging.WordprocessingDocument, UK.Gov.Legislation.Judgments.AttachmentType>>, UK.Gov.Legislation.Judgments.Parse.Judgment>;
using UK.Gov.Legislation.Judgments.Parse;
namespace UK.Gov.NationalArchives.Judgments.Api {
/// <summary>
/// Optional hint naming the kind of document being submitted, used to choose a parser.
/// </summary>
public enum Hint {
    /// <summary>Parse with the UKSC-specific parser.</summary>
    UKSC,
    /// <summary>Parse with the same parser that is used for <see cref="EWHC"/>.</summary>
    EWCA,
    /// <summary>Parse with the EWHC-specific parser.</summary>
    EWHC,
    /// <summary>Parse with the UKUT-specific parser.</summary>
    UKUT,
    /// <summary>The document is a judgment, but the best-fitting judgment parser must be worked out.</summary>
    Judgment,
    /// <summary>The document is a press summary.</summary>
    PressSummary
}
/// <summary>
/// Signals that a parsed document failed validation.
/// </summary>
public class InvalidAkNException : Exception {

    /// <summary>
    /// Builds the exception from a validation event, reusing the event's message
    /// and keeping its underlying exception as the inner exception.
    /// </summary>
    /// <param name="cause">the validation event to wrap; must not be null</param>
    public InvalidAkNException(ValidationEventArgs cause)
        : base(cause.Message, cause.Exception) {
    }

}
public class Parser {
// Logger shared by all static members of this class. Assigned once here and never replaced,
// so it is declared readonly.
private static readonly ILogger Logger = Logging.Factory.CreateLogger<Parser>();
// Validator run against every parsed document before its XML is serialized.
// Also assigned once, hence readonly.
private static readonly AkN.Validator validator = new AkN.Validator();
/// <summary>
/// Parses the document carried by a request and returns its XML, metadata and images.
/// </summary>
/// <remarks>
/// The parser is chosen from <c>request.Hint</c>. The parsed document is validated first;
/// only a valid document is serialized. The bundle produced by the parser is disposed
/// whether or not validation and conversion succeed.
/// </remarks>
/// <param name="request">the document content plus optional filename, hint, metadata and attachments</param>
/// <returns>the serialized XML, the metadata extracted from it, and the document's images</returns>
/// <exception cref="ArgumentNullException">if <paramref name="request"/> is null</exception>
/// <exception cref="InvalidAkNException">if validation reports any error; the first error is wrapped</exception>
public static Response Parse(Request request) {
    if (request is null)
        throw new ArgumentNullException(nameof(request));
    if (request.Filename is not null)
        // Pass the filename as a template argument instead of interpolating it into the template.
        Logger.LogInformation("parsing {Filename}", request.Filename);
    ParseFunction parse = GetParser(request.Hint);
    IOutsideMetadata meta1 = (request.Meta is null) ? null : new MetaWrapper() { Meta = request.Meta };
    IEnumerable<AttachmentPair> attachments = (request.Attachments is null)
        ? Enumerable.Empty<AttachmentPair>()
        : request.Attachments.Select(a => ConvertAttachment(a));
    AkN.ILazyBundle bundle = parse(request.Content, meta1, attachments);
    try {
        List<ValidationEventArgs> errors = validator.Validate(bundle.Judgment);
        if (errors.Count > 0)
            throw new InvalidAkNException(errors[0]);
        string xml = SerializeXml(bundle.Judgment);
        AkN.Meta aknMetadata = AkN.MetadataExtractor.Extract(bundle.Judgment);
        Meta meta2 = ConvertInternalMetadata(aknMetadata);
        Log(meta2);
        // Materialize the images now: they are read from the bundle, which is disposed below.
        List<Image> images = bundle.Images.Select(i => ConvertImage(i)).ToList();
        return new Response() {
            Xml = xml,
            Meta = meta2,
            Images = images
        };
    } finally {
        // Previously skipped when validation (or any later step) threw, leaking the bundle.
        bundle.Dispose();
    }
}
/// <summary>
/// Picks the parse function that corresponds to an optional hint.
/// </summary>
/// <param name="hint">the caller's hint, or null when none was given</param>
/// <returns>
/// With no hint, a function that tries judgment parsing and then press-summary parsing;
/// otherwise the function matching the hint (EWHC and EWCA share one parser).
/// </returns>
/// <exception cref="Exception">if the hint is not one of the handled values</exception>
private static ParseFunction GetParser(Hint? hint) {
    switch (hint) {
        case null:
            return JudgmentOrPressSummary;
        case Hint.Judgment:
            return ParseAnyJudgment;
        case Hint.EWHC:
        case Hint.EWCA:
            return Wrap(OptimizedEWHCParser.Parse);
        case Hint.UKSC:
            return Wrap(OptimizedUKSCParser.Parse);
        case Hint.UKUT:
            return Wrap(OptimizedUKUTParser.Parse);
        case Hint.PressSummary:
            return ParsePressSummary;
        default:
            throw new Exception("unsupported hint: " + Enum.GetName(typeof(Hint), hint));
    }
}
/// <summary>
/// Adapts a parser that works on an opened, pre-parsed Word document into a
/// <c>ParseFunction</c> that works on raw bytes.
/// </summary>
/// <param name="f">the parser to adapt</param>
/// <returns>
/// a function that opens the bytes, pre-parses the document, converts the attachments,
/// runs <paramref name="f"/> and bundles the opened document with the result
/// </returns>
private static ParseFunction Wrap(OptimizedParseFunction f) {
    return (byte[] docx, IOutsideMetadata meta, IEnumerable<AttachmentPair> attachments) => {
        WordprocessingDocument doc = AkN.Parser.Read(docx);
        try {
            WordDocument preParsed = new PreParser().Parse(doc);
            IEnumerable<AttachmentPair1> attach2 = AkN.Parser.ConvertAttachments(attachments);
            IJudgment judgment = f(doc, preParsed, meta, attach2);
            // NOTE(review): the bundle presumably takes over disposal of doc — confirm in AkN.Bundle.
            return new AkN.Bundle(doc, judgment);
        } catch {
            // No bundle was produced, so nothing else will ever dispose the opened document.
            doc.Dispose();
            throw;
        }
    };
}
/// <summary>
/// Parses a document known to be a judgment, without knowing which judgment parser fits it.
/// </summary>
/// <param name="docx">the raw bytes of the Word document</param>
/// <param name="meta">metadata supplied from outside the document; may be null</param>
/// <param name="attachments">attachments to pass on to the judgment parsers</param>
/// <returns>a bundle holding the opened document and the judgment chosen by <c>BestJudgment</c></returns>
private static AkN.ILazyBundle ParseAnyJudgment(byte[] docx, IOutsideMetadata meta, IEnumerable<System.Tuple<byte[], UK.Gov.Legislation.Judgments.AttachmentType>> attachments) {
    WordprocessingDocument doc = AkN.Parser.Read(docx);
    try {
        WordDocument preParsed = new PreParser().Parse(doc);
        IJudgment judgment = BestJudgment(preParsed, meta, attachments);
        // NOTE(review): the bundle presumably takes over disposal of doc — confirm in AkN.Bundle.
        return new AkN.Bundle(doc, judgment);
    } catch {
        // No bundle was produced, so nothing else will ever dispose the opened document.
        doc.Dispose();
        throw;
    }
}
/// <summary>
/// Runs the judgment parsers in a fixed order (EWHC, then UKSC, then UKUT) and picks a result.
/// </summary>
/// <param name="preParsed">the pre-parsed document; its <c>Docx</c> is handed to each parser</param>
/// <param name="meta">metadata supplied from outside the document; may be null</param>
/// <param name="attachments">raw attachments, converted once and passed to every parser</param>
/// <returns>
/// the first result whose score equals <c>PerfectScore</c>; otherwise the highest-scoring
/// result, with the earlier parser winning ties
/// </returns>
private static Judgment BestJudgment(WordDocument preParsed, IOutsideMetadata meta, IEnumerable<System.Tuple<byte[], UK.Gov.Legislation.Judgments.AttachmentType>> attachments) {
    // NOTE(review): this sequence is handed to each parser in turn; if it is lazy it may be
    // enumerated more than once — confirm against AkN.Parser.ConvertAttachments.
    IEnumerable<AttachmentPair1> converted = AkN.Parser.ConvertAttachments(attachments);
    OptimizedParseFunction[] candidates = {
        OptimizedEWHCParser.Parse,
        OptimizedUKSCParser.Parse,
        OptimizedUKUTParser.Parse
    };
    Judgment best = null;
    int bestScore = -1; // below every real score, so the first candidate is always kept
    foreach (OptimizedParseFunction candidate in candidates) {
        Judgment parsed = candidate(preParsed.Docx, preParsed, meta, converted);
        int parsedScore = Score(parsed);
        if (parsedScore == PerfectScore)
            return parsed;
        // Strictly greater: on a tie the earlier parser's result stands.
        if (parsedScore > bestScore) {
            best = parsed;
            bestScore = parsedScore;
        }
    }
    return best;
}
/// <summary>
/// Parses a document of unknown kind, preferring a judgment and falling back to a press summary.
/// </summary>
/// <remarks>
/// A judgment with a perfect score is returned straight away. Otherwise the press-summary
/// parser is tried and its result is used if it found a document type. If not, the
/// imperfect judgment is returned after all.
/// </remarks>
/// <param name="docx">the raw bytes of the Word document</param>
/// <param name="meta">metadata supplied from outside the document; may be null</param>
/// <param name="attachments">attachments to pass on to the judgment parsers</param>
/// <returns>a bundle holding the opened document and either a judgment or a press summary</returns>
private static AkN.ILazyBundle JudgmentOrPressSummary(byte[] docx, IOutsideMetadata meta, IEnumerable<System.Tuple<byte[], UK.Gov.Legislation.Judgments.AttachmentType>> attachments) {
    WordprocessingDocument doc = AkN.Parser.Read(docx);
    try {
        WordDocument preParsed = new PreParser().Parse(doc);
        Judgment judgment = BestJudgment(preParsed, meta, attachments);
        if (Score(judgment) == PerfectScore)
            return new AkN.Bundle(doc, judgment);
        PS.PressSummary ps = PS.Parser.Parse(preParsed, meta);
        if (ps.InternalMetadata.DocType is not null)
            return new AkN.PSBundle(doc, ps);
        return new AkN.Bundle(doc, judgment);
    } catch {
        // No bundle was produced, so nothing else will ever dispose the opened document.
        doc.Dispose();
        throw;
    }
}
// The highest value Score can return: 2 for a non-empty header plus 1 for each of the
// five metadata fields it checks. It never changes, so it is a constant, not a mutable static.
private const int PerfectScore = 7;

/// <summary>
/// Rates how much of a judgment the parser managed to identify.
/// </summary>
/// <param name="judgment">the parsed judgment to rate; must not be null</param>
/// <returns>
/// 2 points for a non-empty header, plus 1 point each for a non-null short URI component,
/// court, citation, date and name — between 0 and <c>PerfectScore</c>
/// </returns>
private static int Score(Judgment judgment) {
    int score = 0;
    var header = judgment.Header;
    if (header is not null && header.Any())
        score += 2;
    var metadata = judgment.InternalMetadata;
    if (metadata.ShortUriComponent is not null)
        score += 1;
    if (metadata.Court is not null)
        score += 1;
    if (metadata.Cite is not null)
        score += 1;
    if (metadata.Date is not null)
        score += 1;
    if (metadata.Name is not null)
        score += 1;
    return score;
}
/// <summary>
/// Parses a document known to be a press summary.
/// </summary>
/// <param name="docx">the raw bytes of the Word document</param>
/// <param name="meta">metadata supplied from outside the document; may be null</param>
/// <param name="attachments">not used; present only so the method matches <c>ParseFunction</c></param>
/// <returns>a bundle holding the opened document and the parsed press summary</returns>
private static AkN.ILazyBundle ParsePressSummary(byte[] docx, IOutsideMetadata meta, IEnumerable<System.Tuple<byte[], UK.Gov.Legislation.Judgments.AttachmentType>> attachments) {
    WordprocessingDocument doc = AkN.Parser.Read(docx);
    try {
        PS.PressSummary ps = PS.Parser.Parse(doc, meta);
        // NOTE(review): the bundle presumably takes over disposal of doc — confirm in AkN.PSBundle.
        return new AkN.PSBundle(doc, ps);
    } catch {
        // No bundle was produced, so nothing else will ever dispose the opened document.
        doc.Dispose();
        throw;
    }
}
/* */
/// <summary>
/// Serializes an XML document to a string.
/// </summary>
/// <param name="judgment">the document to serialize</param>
/// <returns>the bytes written by the serializer, decoded as UTF-8</returns>
internal static string SerializeXml(XmlDocument judgment) {
    using (var buffer = new MemoryStream()) {
        AkN.Serializer.Serialize(judgment, buffer);
        byte[] bytes = buffer.ToArray();
        return System.Text.Encoding.UTF8.GetString(bytes);
    }
}
/// <summary>
/// Copies the metadata extracted from a parsed document into the API's metadata type.
/// </summary>
/// <param name="meta">the extracted metadata</param>
/// <returns>
/// the API metadata; <c>Uri</c> is null when <c>URI.IsEmpty</c> reports the work URI as empty,
/// and <c>Attachments</c> is a deferred projection of the external attachments
/// </returns>
internal static Meta ConvertInternalMetadata(UK.Gov.Legislation.Judgments.AkomaNtoso.Meta meta) {
    var workUri = meta.WorkUri;
    var attachments =
        from attachment in meta.ExternalAttachments
        select new ExternalAttachment() { Name = attachment.ShowAs, Link = attachment.Href };
    return new Meta() {
        DocumentType = meta.DocElementName,
        Uri = URI.IsEmpty(workUri) ? null : workUri,
        Court = meta.UKCourt,
        Cite = meta.UKCite,
        Date = meta.WorkDate,
        Name = meta.WorkName,
        Attachments = attachments
    };
}
/// <summary>
/// Copies an image from a parsed bundle into the API's image type.
/// </summary>
/// <param name="image">the source image</param>
/// <returns>an image carrying the source's name, its content type, and the result of reading it</returns>
internal static Image ConvertImage(IImage image) {
    var name = image.Name;
    var contentType = image.ContentType;
    var content = image.Read();
    return new Image() { Name = name, Type = contentType, Content = content };
}
/// <summary>
/// Converts an API attachment into the (content, type) pair that the parsers accept.
/// </summary>
/// <param name="a">the attachment to convert</param>
/// <returns>the attachment's content paired with the matching internal attachment type</returns>
/// <exception cref="ArgumentOutOfRangeException">
/// if the attachment's type is neither Order nor Appendix
/// </exception>
internal static AttachmentPair ConvertAttachment(Attachment a) {
    UK.Gov.Legislation.Judgments.AttachmentType type;
    switch (a.Type) {
        case Api.AttachmentType.Order:
            type = UK.Gov.Legislation.Judgments.AttachmentType.Order;
            break;
        case Api.AttachmentType.Appendix:
            type = UK.Gov.Legislation.Judgments.AttachmentType.Appendix;
            break;
        default:
            // Was a bare System.Exception with no message; name the parameter and the bad value.
            throw new ArgumentOutOfRangeException(nameof(a), a.Type, "unsupported attachment type");
    }
    return new AttachmentPair(a.Content, type);
}
/// <summary>
/// Logs each metadata field of a parsed document: an information entry with its value
/// when the field is present, a warning when it is missing.
/// </summary>
/// <remarks>
/// The document type counts as missing when it is null or empty. The URI counts as missing
/// when its short URI component is null or empty. All other fields are only checked for null.
/// </remarks>
/// <param name="meta">the metadata to report on</param>
internal static void Log(Api.Meta meta) {
    string documentType = meta.DocumentType;

    if (!string.IsNullOrEmpty(documentType)) {
        Logger.LogInformation("The document type is {}", documentType);
    } else {
        Logger.LogWarning("The document type is null");
    }

    if (!string.IsNullOrEmpty(URI.ExtractShortURIComponent(meta.Uri))) {
        Logger.LogInformation("The {} uri is {}", documentType, meta.Uri);
    } else {
        Logger.LogWarning("The {} uri is null", documentType);
    }

    if (meta.Court is not null) {
        Logger.LogInformation("The court is {}", meta.Court);
    } else {
        Logger.LogWarning("The court is null");
    }

    if (meta.Cite is not null) {
        Logger.LogInformation("The case citation is {}", meta.Cite);
    } else {
        Logger.LogWarning("The case citation is null");
    }

    if (meta.Date is not null) {
        Logger.LogInformation("The {} date is {}", documentType, meta.Date);
    } else {
        Logger.LogWarning("The {} date is null", documentType);
    }

    if (meta.Name is not null) {
        Logger.LogInformation("The {} name is {}", documentType, meta.Name);
    } else {
        Logger.LogWarning("The {} name is null", documentType);
    }
}
}
}