Skip to content

Commit

Permalink
fix(mlcli): external groundtruth sources (#287)
Browse files Browse the repository at this point in the history
  • Loading branch information
guillaume-chervet authored Mar 24, 2023
1 parent efa18ab commit be6ef13
Show file tree
Hide file tree
Showing 9 changed files with 119 additions and 82 deletions.
18 changes: 9 additions & 9 deletions README-ML-CLI.md
Original file line number Diff line number Diff line change
Expand Up @@ -335,15 +335,15 @@ etc.
## Ml-Cli batch

You can execute several tasks in command line interface (CLI):
- [`wait_version_change`](./src/Ml.Cli/JobVersion#readme) is a task that will wait for the version obtained via the url to change for a user-defined amount of time.
- [`callapi`](./src/Ml.Cli/JobApiCall#readme) is a task which will call an online service to get jsons files describing files containing images. These json files contain a list of URLs leading to extracted images of the files containing images. The task can also download these images after generating the related json file.
- [`parallel`](./src/Ml.Cli/JobParallel#readme) and serial are used to describe the way of handling your tasks.
- [`serial`](./src/Ml.Cli/JobSerial#readme) are used to describe the way of handling your tasks.
- [`loop`](./src/Ml.Cli/JobLoop#readme) is used to execute the task indefinitely.
- [`script`](./src/Ml.Cli/JobScript#readme) will execute a user-defined script on files stored in a repository.
- [`compare`](./src/Ml.Cli/JobCompare#readme) is used to compare two sets of json files; the resulting json file can be used to see the results with the help of the server.
- [`dataset`](./src/Ml.Cli/JobDataset#readme) is used to generate a dataset file which will contain all annotations (of a same, user-specified type and configuration) made on json files with the help of Ml-Cli front.
- [`copy`](./src/Ml.Cli/JobCopy#readme) copy from a directory to another directory.
- [`wait_version_change`](./src/MlCli/JobVersion#readme) is a task that will wait for the version obtained via the url to change for a user-defined amount of time.
- [`callapi`](./src/MlCli/JobApiCall#readme) is a task that calls an online service to get JSON files describing files that contain images. These JSON files contain a list of URLs leading to the images extracted from those files. The task can also download these images after generating the related JSON file.
- [`parallel`](./src/MlCli/JobParallel#readme) is used to describe the way of handling your tasks (run in parallel).
- [`serial`](./src/MlCli/JobSerial#readme) is used to describe the way of handling your tasks (run one after another).
- [`loop`](./src/MlCli/JobLoop#readme) is used to execute the task indefinitely.
- [`script`](./src/MlCli/JobScript#readme) will execute a user-defined script on files stored in a repository.
- [`compare`](./src/MlCli/JobCompare#readme) is used to compare two sets of json files; the resulting json file can be used to see the results with the help of the server.
- [`dataset`](./src/MlCli/JobDataset#readme) is used to generate a dataset file which will contain all annotations (of a same, user-specified type and configuration) made on json files with the help of Ml-Cli front.
- [`copy`](./src/MlCli/JobCopy#readme) copies files from one directory to another directory.


### tasks-sample.json
Expand Down
2 changes: 1 addition & 1 deletion src/Ecotag/Properties/launchSettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"LocalBatch": {
"commandName": "Project",
"launchBrowser": true,
"commandLineArgs": "--tasks-path ../../demo/tasks-licenses.json --base-path ../../demo --compares-paths licenses/compares --datasets-paths licenses/datasets",
"commandLineArgs": "--tasks-path C:/Demo/integration/tasks-ged.json --base-path C:/Demo/integration",
"applicationUrl": "http://localhost:5000",
"environmentVariables": {
"ASPNETCORE_ENVIRONMENT": "Development"
Expand Down
53 changes: 28 additions & 25 deletions src/MlCli/JobApiCall/ApiCallFiles.cs
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,30 @@ public ApiCallFiles(IHttpClientFactory httpClientFactory, IFileLoader fileLoader
/// <summary>
/// Recursively walks a JSON token tree and collects the properties whose names
/// contain the markers "url_", "input_" or "output_".
/// </summary>
/// <param name="containerToken">Token to inspect. Only objects and arrays are traversed; any other token type is ignored.</param>
/// <param name="urlsList">Receives one <c>FileUrls</c> per property whose name contains "url_".</param>
/// <param name="jsonsDict">
/// Indexed directly with the keys "input" and "output", so both keys must already be present;
/// matching properties are appended to the corresponding <c>Jsons</c> list.
/// </param>
private void FindElementsTokens(JToken containerToken, List<FileUrls> urlsList,
    Dictionary<string, JsonsList> jsonsDict)
{
    switch (containerToken.Type)
    {
        case JTokenType.Object:
        {
            foreach (var child in containerToken.Children<JProperty>())
            {
                // The first matching marker wins: a name is classified as url, input or output, never several.
                if (child.Name.Contains("url_", StringComparison.Ordinal))
                    urlsList.Add(new FileUrls(child.Name, new Uri(child.Value.ToString())));
                else if (child.Name.Contains("input_", StringComparison.Ordinal))
                    jsonsDict["input"].Jsons.Add(new FileJsons(child.Name, child.Value.ToString()));
                else if (child.Name.Contains("output_", StringComparison.Ordinal))
                    jsonsDict["output"].Jsons.Add(new FileJsons(child.Name, child.Value.ToString()));

                // Always descend into the value, whether or not the property itself matched.
                FindElementsTokens(child.Value, urlsList, jsonsDict);
            }

            break;
        }
        case JTokenType.Array:
        {
            foreach (var child in containerToken.Children())
                FindElementsTokens(child, urlsList, jsonsDict);
            break;
        }
    }
}

public async Task ApiCallFilesAsync(string fileName, string json, Callapi inputTask)
Expand Down Expand Up @@ -78,10 +88,8 @@ public async Task ApiCallFilesAsync(string fileName, string json, Callapi inputT
FindElementsTokens(node, urlsList, jsonsDict);

var tempStringsArray = inputTask.DownloadStringsMatcher.Split(",");
var stringsArray = new List<string>();
foreach (var regex in tempStringsArray) stringsArray.Add(regex.Trim());

await DownloadFilesAsync(httpClient, fileName, urlsList, jsonsDict, stringsArray.ToArray(), inputTask);
await DownloadFilesAsync(httpClient, fileName, urlsList, jsonsDict, tempStringsArray.Select(regex => regex.Trim()).ToArray(), inputTask);
}
catch (Exception ex)
{
Expand All @@ -95,17 +103,15 @@ private async Task DownloadFilesAsync(HttpClient httpClient, string fileName, Li
if (inputTask.EnabledSaveImages)
{
_fileLoader.CreateDirectory(inputTask.OutputDirectoryImages);
foreach (var imageUrl in urlsList)
if (IsStringsArrayMatch(imageUrl.Key, stringsArray.ToArray()))
await DownloadFilesAsync(httpClient, fileName, imageUrl, inputTask);
foreach (var imageUrl in urlsList.Where(imageUrl => IsStringsArrayMatch(imageUrl.Key, stringsArray.ToArray())))
await DownloadFilesAsync(httpClient, fileName, imageUrl, inputTask);
}

if (inputTask.EnabledSaveInputs)
{
_fileLoader.CreateDirectory(inputTask.OutputDirectoryInputs);
foreach (var input in jsonsDict["input"].Jsons)
if (IsStringsArrayMatch(input.Key, stringsArray.ToArray()))
await DownloadJsonAsync(input, inputTask, true);
foreach (var input in jsonsDict["input"].Jsons.Where(input => IsStringsArrayMatch(input.Key, stringsArray.ToArray())))
await DownloadJsonAsync(input, inputTask, true);
}

if (inputTask.EnabledSaveOutputs)
Expand Down Expand Up @@ -165,10 +171,7 @@ private async Task DownloadJsonAsync(FileJsons fileJson, Callapi inputTask, bool
/// <summary>
/// Tells whether <paramref name="input"/> contains at least one of the given substrings.
/// </summary>
/// <param name="input">Text to search in; <c>null</c> yields <c>false</c>.</param>
/// <param name="stringsArray">Candidate substrings; <c>null</c> or an empty array yields <c>false</c>.</param>
/// <returns><c>true</c> when any element of <paramref name="stringsArray"/> occurs in <paramref name="input"/> (ordinal comparison).</returns>
public static bool IsStringsArrayMatch(string input, string[] stringsArray)
{
    if (stringsArray == null || input == null) return false;

    // Ordinal matching is what string.Contains(string) already does; it is spelled out here for clarity.
    return Array.Exists(stringsArray, element => input.Contains(element, StringComparison.Ordinal));
}

private interface IFileObjects
Expand Down
8 changes: 7 additions & 1 deletion src/MlCli/JobApiCall/Callapi.cs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ namespace AxaGuilDEv.MlCli.JobApiCall;

public class Callapi : IInputTask
{

public Callapi(string type, string id, bool enabled, string fileDirectory, string outputDirectoryJsons,
string outputDirectoryImages, string outputDirectoryInputs, string outputDirectoryOutputs,
string frontDefaultStringsMatcher, string downloadStringsMatcher, bool enabledSaveImages,
Expand All @@ -16,9 +17,12 @@ public Callapi(string type, string id, bool enabled, string fileDirectory, strin
bool isSaveResultOnError = true,
int? stopAfterNumberFiles = null,
int? chunkByNumberPart = null,
int? chunkIndex = null
int? chunkIndex = null,
bool isDefaultTargetFileMode = true

)
{
IsDefaultTargetFileMode = isDefaultTargetFileMode;
StopAfterNumberFiles = stopAfterNumberFiles;
Type = type;
Id = id;
Expand Down Expand Up @@ -71,4 +75,6 @@ public Callapi(string type, string id, bool enabled, string fileDirectory, strin
public string Id { get; }
public string Type { get; }
public bool Enabled { get; }

public bool IsDefaultTargetFileMode { get; }
}
4 changes: 3 additions & 1 deletion src/MlCli/JobApiCall/Initializer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public static Callapi CreateTask(JObject jObject, string type, bool tokenEnabled
jObject.ContainsKey("chunkIndex") ? (int?)jObject.Property("chunkIndex") : null;

var numberParallel = jObject.ContainsKey("numberParallel") ? (int)jObject.Property("numberParallel") : 1;
var defaultTargetFileMode = !jObject.ContainsKey("isDefaultTargetFileMode") || (bool)jObject.Property("isDefaultTargetFileMode");
var waitTimeMsBetweenRequest = jObject.ContainsKey("waitTimeMsBetweenRequest")
? (int)jObject.Property("waitTimeMsBetweenRequest")
: 0;
Expand Down Expand Up @@ -128,7 +129,8 @@ public static Callapi CreateTask(JObject jObject, string type, bool tokenEnabled
isSaveResultOnError,
stopAfterNumberFiles,
chunkByNumberPart,
chunkIndex
chunkIndex,
defaultTargetFileMode
);
}

Expand Down
41 changes: 28 additions & 13 deletions src/MlCli/JobApiCall/Job.cs
Original file line number Diff line number Diff line change
Expand Up @@ -121,16 +121,17 @@ public async Task ApiCallAsync(Callapi inputTask)
}
}

private async Task<string> PlayDataAsync(HttpClient httpClient, Callapi inputTask, string currentFile,
private async Task<string> PlayDataAsync(HttpClient httpClient, Callapi inputTask, string currentFilePath,
string extension, string outputDirectory)
{
if (Path.GetExtension(currentFile) == ".json") return string.Empty;
if (Path.GetExtension(currentFilePath) == ".json") return string.Empty;

var fileName = Path.GetFileName(currentFile);
var jsonFileName = $"{fileName.Replace(".", "_")}{extension}";

var jsonFileName = GetTargetFileName(inputTask.IsDefaultTargetFileMode, currentFilePath, extension);
var targetFileName = Path.Combine(outputDirectory, jsonFileName);
try
{
var fileName = Path.GetFileName(currentFilePath);
if (_fileLoader.FileExists(targetFileName))
{
_logger.LogWarning($"Task Id: {inputTask.Id} - Already processed file {fileName}");
Expand All @@ -145,7 +146,7 @@ private async Task<string> PlayDataAsync(HttpClient httpClient, Callapi inputTas
{
try
{
httpResult = await CallHttpAsync(httpClient, inputTask, currentFile, jsonFileName, i);
httpResult = await CallHttpAsync(httpClient, inputTask, currentFilePath, jsonFileName, i);
if (httpResult.StatusCode < 500) break;
}
catch (Exception e)
Expand Down Expand Up @@ -173,14 +174,13 @@ private async Task<string> PlayDataAsync(HttpClient httpClient, Callapi inputTas
if (httpResult == null)
throw new ApplicationException("httpResult is null");

if (httpResult.StatusCode < 500 || (inputTask.IsSaveResultOnError && httpResult.StatusCode >= 500))
{
var json = JsonConvert.SerializeObject(httpResult, Formatting.Indented);
await _fileLoader.WriteAllTextInFileAsync(targetFileName,
json);
if (inputTask.EnabledSaveImages || inputTask.EnabledSaveInputs || inputTask.EnabledSaveOutputs)
await _callFiles.ApiCallFilesAsync(fileName, json, inputTask);
}
if (httpResult.StatusCode >= 500 && (!inputTask.IsSaveResultOnError || httpResult.StatusCode < 500))
return httpResult.StatusCode < 500 ? "OK" : "KO";
var json = JsonConvert.SerializeObject(httpResult, Formatting.Indented);
await _fileLoader.WriteAllTextInFileAsync(targetFileName,
json);
if (inputTask.EnabledSaveImages || inputTask.EnabledSaveInputs || inputTask.EnabledSaveOutputs)
await _callFiles.ApiCallFilesAsync(fileName, json, inputTask);

return httpResult.StatusCode < 500 ? "OK" : "KO";
}
Expand All @@ -191,6 +191,21 @@ await _fileLoader.WriteAllTextInFileAsync(targetFileName,
}
}

/// <summary>
/// Builds the name of the result file generated for an input file.
/// </summary>
/// <param name="isDefaultTargetFileMode">
/// <c>true</c>: every dot of the input file name is replaced by an underscore and
/// <paramref name="extension"/> is appended (<c>a.b.pdf</c> becomes <c>a_b_pdf.json</c>).
/// <c>false</c>: only the last extension is replaced by <paramref name="extension"/>
/// (<c>a.b.pdf</c> becomes <c>a.b.json</c>).
/// </param>
/// <param name="currentFilePath">Path of the input file; only its file-name part is used.</param>
/// <param name="extension">Extension to append, including the leading dot (for example <c>.json</c>).</param>
/// <returns>The target file name, without any directory part.</returns>
/// <exception cref="ArgumentNullException"><paramref name="currentFilePath"/> is <c>null</c>.</exception>
public static string GetTargetFileName(bool isDefaultTargetFileMode, string currentFilePath, string extension)
{
    if (currentFilePath == null) throw new ArgumentNullException(nameof(currentFilePath));

    if (isDefaultTargetFileMode)
    {
        var fileName = Path.GetFileName(currentFilePath);
        return $"{fileName.Replace(".", "_")}{extension}";
    }

    // Strip only the trailing extension. A textual Replace of the extension would also remove
    // earlier occurrences ("a.pdf.b.pdf") and throws when the file has no extension at all.
    return Path.GetFileNameWithoutExtension(currentFilePath) + extension;
}

private async Task<Program.HttpResult> CallHttpAsync(HttpClient httpClient, Callapi inputTask, string file,
string targetFileName, int tryNumber)
{
Expand Down
7 changes: 6 additions & 1 deletion src/MlCli/JobApiCall/README.MD
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ tasks.json
"delayOn500":5000,
"isSaveResultOnError": true,
"stopAfterNumberFiles":null,
"isDefaultTargetFileMode": true
}
]
```
Expand Down Expand Up @@ -123,4 +124,8 @@ tasks.json
- **isSaveResultOnError**: when true, the result is saved even if the response status code is greater than or equal to 500; when false, such results are not saved
- default: true
- **stopAfterNumberFiles**: after "stopAfterNumberFiles" files the JobApiCall stop
- default: null
- default: null
- **isDefaultTargetFileMode**: Pattern to generate file name in outputDirectoryJsons
- default: true
- true: generate file from {000E90E8-5C65-413A-8911-CE848FE245B4}.4dd580d307bf5820e0a3b36211055fc7.pdf to {000E90E8-5C65-413A-8911-CE848FE245B4}_4dd580d307bf5820e0a3b36211055fc7_pdf.json
- false: generate file from {000E90E8-5C65-413A-8911-CE848FE245B4}.4dd580d307bf5820e0a3b36211055fc7.pdf to {000E90E8-5C65-413A-8911-CE848FE245B4}.4dd580d307bf5820e0a3b36211055fc7.json
57 changes: 26 additions & 31 deletions src/MlCli/JobCompare/Job.cs
Original file line number Diff line number Diff line change
Expand Up @@ -28,41 +28,16 @@ public async Task CompareAsync(CompareTask inputTask)
? fileLeftPath
: Path.Combine(inputTask.LeftDirectory, fileLeftPath);
_logger.LogInformation($"Task Id: {inputTask.Id} - Compare {fileName}");
var jsonLeft = await _fileLoader.ReadAllTextInFileAsync(filePath);
Program.HttpResult left;
try
{
left = JsonConvert.DeserializeObject<Program.HttpResult>(jsonLeft);
}
catch (JsonException)
{
left = CreateHttpResult(jsonLeft, fileName, filePath);
}

var left = await FormatToHttpResult(filePath, fileName, filePath);
var fileRightPath = Path.Combine(inputTask.RightDirectory, fileName);
if (!_fileLoader.FileExists(fileRightPath))
{
if (inputTask.OnFileNotFound == "warning")
{
_logger.LogWarning(
$"Task Id: {inputTask.Id} - File not found for comparison in right path: {fileRightPath}");
continue;
}

throw new FileNotFoundException(fileRightPath);
}

var jsonRight = await _fileLoader.ReadAllTextInFileAsync(fileRightPath);
Program.HttpResult right;
try
{
right = JsonConvert.DeserializeObject<Program.HttpResult>(jsonRight);
}
catch (JsonException)
{
right = CreateHttpResult(jsonRight, fileName, fileRightPath);
if (inputTask.OnFileNotFound != "warning") throw new FileNotFoundException(fileRightPath);
_logger.LogWarning(
$"Task Id: {inputTask.Id} - File not found for comparison in right path: {fileRightPath}");
continue;
}

var right = await FormatToHttpResult(fileRightPath, fileName, filePath);
compareResults.Add(new CompareResult
{
FileName = fileName,
Expand All @@ -81,6 +56,26 @@ await _fileLoader.WriteAllTextInFileAsync(
Formatting.Indented));
}

/// <summary>
/// Reads a file and returns its content as a <see cref="Program.HttpResult"/>.
/// When the content is not a serialized result (it fails to deserialize, deserializes to
/// <c>null</c>, or has no <c>Body</c>), the raw text is wrapped with <c>CreateHttpResult</c> instead.
/// </summary>
/// <param name="fileRightPath">Path of the file to read. Despite its name, callers use it for the left file as well.</param>
/// <param name="fileName">File name forwarded to <c>CreateHttpResult</c>.</param>
/// <param name="filePath">Path forwarded to <c>CreateHttpResult</c> when the content parses but carries no <c>Body</c>.</param>
/// <returns>The deserialized result, or a result built around the raw file content.</returns>
private async Task<Program.HttpResult> FormatToHttpResult(string fileRightPath, string fileName, string filePath)
{
    var rawJson = await _fileLoader.ReadAllTextInFileAsync(fileRightPath);
    try
    {
        var parsed = JsonConvert.DeserializeObject<Program.HttpResult>(rawJson);

        // DeserializeObject yields null for empty or literal "null" content, so guard
        // before reading Body instead of failing with a NullReferenceException.
        if (parsed?.Body != null) return parsed;

        return CreateHttpResult(rawJson, fileName, filePath);
    }
    catch (JsonException)
    {
        // NOTE(review): this fallback passes fileRightPath while the one above passes filePath;
        // confirm the difference is intentional.
        return CreateHttpResult(rawJson, fileName, fileRightPath);
    }
}

private Program.HttpResult CreateHttpResult(string body, string fileName, string fileDirectory)
{
return new Program.HttpResult
Expand Down
11 changes: 11 additions & 0 deletions tests/MlCli.Tests/JobsTests/ApiCallUnitTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -268,5 +268,16 @@ public void ApiCallShouldInitialize()
1,
1);
Assert.Equal(JsonConvert.SerializeObject(expectedCallapiResult), JsonConvert.SerializeObject(callapiResult));
}

/// <summary>
/// Checks the file name produced by <c>TaskApiCall.GetTargetFileName</c> in both naming modes.
/// </summary>
[Theory]
[InlineData(false, @"{00087046-E26C-4183-826D-E6F074C0ED08}.9f7f4e92aabcbe45ca7a34212a05865f.json")]
[InlineData(true, @"{00087046-E26C-4183-826D-E6F074C0ED08}_9f7f4e92aabcbe45ca7a34212a05865f_pdf.json")]
public void GetTargetFileNameShouldReturnCorrectFilename(bool isDefaultTargetFileMode, string expectedFilename)
{
    // Forward slashes are treated as directory separators on both Windows and Unix,
    // whereas a backslash path is only split on Windows and would make this test platform-dependent.
    const string currentFilePath = "C:/Demo/{00087046-E26C-4183-826D-E6F074C0ED08}.9f7f4e92aabcbe45ca7a34212a05865f.pdf";

    var filename = TaskApiCall.GetTargetFileName(isDefaultTargetFileMode, currentFilePath, ".json");

    Assert.Equal(expectedFilename, filename);
}

}

0 comments on commit be6ef13

Please sign in to comment.