-
Notifications
You must be signed in to change notification settings - Fork 754
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* [Update] Add MAERec configs * [Update] Add MAERec configs * [Update] Add Union14M data configs * [Update] Update docs for MAERec * [Update] Fix lint * [Update] Update MAERec configs * [Update] Update MAERec
- Loading branch information
1 parent
e50c5fd
commit 8b429ab
Showing
18 changed files
with
1,008 additions
and
3 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Base directories for the Union14M-L data. Note that union14m_root already | ||
# ends with a trailing slash, while union14m_benchmark_root does not. | ||
union14m_root = 'data/Union14M-L/' | ||
union14m_benchmark_root = 'data/Union14M-L/Union14M-Benchmarks' | ||
|
||
# Union14M-Benchmark "artistic" subset: images and annotation.json both live | ||
# under <benchmark_root>/artistic. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_artistic = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/artistic'), | ||
ann_file=f'{union14m_benchmark_root}/artistic/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "contextless" subset: images and annotation.json both | ||
# live under <benchmark_root>/contextless. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_contextless = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/contextless'), | ||
ann_file=f'{union14m_benchmark_root}/contextless/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "curve" subset: images and annotation.json both live | ||
# under <benchmark_root>/curve. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_curve = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/curve'), | ||
ann_file=f'{union14m_benchmark_root}/curve/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "incomplete" subset: images and annotation.json both | ||
# live under <benchmark_root>/incomplete. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_incomplete = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/incomplete'), | ||
ann_file=f'{union14m_benchmark_root}/incomplete/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "incomplete_ori" subset: images and annotation.json both | ||
# live under <benchmark_root>/incomplete_ori. Evaluation-only (test_mode=True). | ||
# NOTE(review): presumably the original (unmodified) counterparts of the | ||
# "incomplete" subset -- confirm against the dataset documentation. | ||
union14m_benchmark_incomplete_ori = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/incomplete_ori'), | ||
ann_file=f'{union14m_benchmark_root}/incomplete_ori/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "multi_oriented" subset: images and annotation.json both | ||
# live under <benchmark_root>/multi_oriented. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_multi_oriented = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/multi_oriented'), | ||
ann_file=f'{union14m_benchmark_root}/multi_oriented/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "multi_words" subset: images and annotation.json both | ||
# live under <benchmark_root>/multi_words. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_multi_words = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/multi_words'), | ||
ann_file=f'{union14m_benchmark_root}/multi_words/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "salient" subset: images and annotation.json both live | ||
# under <benchmark_root>/salient. Evaluation-only (test_mode=True). | ||
# pipeline is left as None here -- presumably filled in by the consuming config. | ||
union14m_benchmark_salient = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_benchmark_root}/salient'), | ||
ann_file=f'{union14m_benchmark_root}/salient/annotation.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# Union14M-Benchmark "general" subset. Unlike the other benchmark subsets, the | ||
# image prefix points at union14m_root rather than a benchmark sub-folder, | ||
# while the annotation file still comes from <benchmark_root>/general. | ||
# NOTE(review): union14m_root already ends with '/', so the f-string below | ||
# evaluates to 'data/Union14M-L//' (double slash) -- harmless for most path | ||
# joins, but confirm the annotation paths are relative to the dataset root. | ||
union14m_benchmark_general = dict( | ||
type='OCRDataset', | ||
data_prefix=dict(img_path=f'{union14m_root}/'), | ||
ann_file=f'{union14m_benchmark_root}/general/annotation.json', | ||
test_mode=True, | ||
pipeline=None) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# Root directory that every ann_file in this config is given relative to | ||
# (passed as data_root to each dataset below). | ||
union14m_data_root = 'data/Union14M-L/' | ||
|
||
union14m_challenging = dict( | ||
type='OCRDataset', | ||
data_root=union14m_data_root, | ||
ann_file='train_annos/mmocr1.0/train_challenging.json', | ||
test_mode=True, | ||
pipeline=None) | ||
|
||
# "Hard" training split of Union14M-L (train_hard.json, relative to | ||
# union14m_data_root). pipeline is left as None here -- presumably filled in | ||
# by the consuming config. | ||
union14m_hard = dict( | ||
type='OCRDataset', | ||
data_root=union14m_data_root, | ||
ann_file='train_annos/mmocr1.0/train_hard.json', | ||
pipeline=None) | ||
|
||
# "Medium" training split of Union14M-L (train_medium.json, relative to | ||
# union14m_data_root). pipeline is left as None here -- presumably filled in | ||
# by the consuming config. | ||
union14m_medium = dict( | ||
type='OCRDataset', | ||
data_root=union14m_data_root, | ||
ann_file='train_annos/mmocr1.0/train_medium.json', | ||
pipeline=None) | ||
|
||
# "Normal" training split of Union14M-L (train_normal.json, relative to | ||
# union14m_data_root). pipeline is left as None here -- presumably filled in | ||
# by the consuming config. | ||
union14m_normal = dict( | ||
type='OCRDataset', | ||
data_root=union14m_data_root, | ||
ann_file='train_annos/mmocr1.0/train_normal.json', | ||
pipeline=None) | ||
|
||
# "Easy" training split of Union14M-L (train_easy.json, relative to | ||
# union14m_data_root). pipeline is left as None here -- presumably filled in | ||
# by the consuming config. | ||
union14m_easy = dict( | ||
type='OCRDataset', | ||
data_root=union14m_data_root, | ||
ann_file='train_annos/mmocr1.0/train_easy.json', | ||
pipeline=None) | ||
|
||
# Validation split of Union14M-L (val_annos.json, stored alongside the | ||
# training annotations under train_annos/mmocr1.0/). | ||
# NOTE(review): test_mode is not set here even though this is a validation | ||
# split -- confirm whether the consuming config is expected to set it. | ||
union14m_val = dict( | ||
type='OCRDataset', | ||
data_root=union14m_data_root, | ||
ann_file='train_annos/mmocr1.0/val_annos.json', | ||
pipeline=None) |
21 changes: 21 additions & 0 deletions
21
configs/textrecog/_base_/schedules/schedule_adamw_cos_10e.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# optimizer | ||
# AdamW with a base learning rate of 4e-4 and weight decay 0.01, wrapped in | ||
# an OptimWrapper. | ||
optim_wrapper = dict( | ||
type='OptimWrapper', | ||
optimizer=dict( | ||
type='AdamW', | ||
lr=4e-4, | ||
betas=(0.9, 0.999), | ||
eps=1e-08, | ||
weight_decay=0.01)) | ||
# Loop configuration: epoch-based training for 10 epochs, with val_interval=1 | ||
# (presumably validation after every epoch). | ||
train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=10, val_interval=1) | ||
val_cfg = dict(type='ValLoop') | ||
test_cfg = dict(type='TestLoop') | ||
|
||
# learning policy | ||
# Single cosine-annealing schedule. T_max=10 equals max_epochs in train_cfg, | ||
# and eta_min=4e-6 is 1% of the optimizer's base lr (4e-4). | ||
# convert_to_iter_based=True presumably makes the scheduler step per | ||
# iteration instead of per epoch -- see the MMEngine scheduler documentation. | ||
param_scheduler = [ | ||
dict( | ||
type='CosineAnnealingLR', | ||
T_max=10, | ||
eta_min=4e-6, | ||
convert_to_iter_based=True) | ||
] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
# MAERec | ||
|
||
> [Revisiting Scene Text Recognition: A Data Perspective](https://arxiv.org/abs/2307.08723) | ||
<!-- [ALGORITHM] --> | ||
|
||
## Abstract | ||
|
||
This paper aims to re-assess scene text recognition (STR) from a data-oriented perspective. We begin by revisiting the six commonly used benchmarks in STR and observe a trend of performance saturation, whereby only 2.91% of the benchmark images cannot be accurately recognized by an ensemble of 13 representative models. While these results are impressive and suggest that STR could be considered solved, however, we argue that this is primarily due to the less challenging nature of the common benchmarks, thus concealing the underlying issues that STR faces. To this end, we consolidate a large-scale real STR dataset, namely Union14M, which comprises 4 million labeled images and 10 million unlabeled images, to assess the performance of STR models in more complex real-world scenarios. Our experiments demonstrate that the 13 models can only achieve an average accuracy of 66.53% on the 4 million labeled images, indicating that STR still faces numerous challenges in the real world. By analyzing the error patterns of the 13 models, we identify seven open challenges in STR and develop a challenge-driven benchmark consisting of eight distinct subsets to facilitate further progress in the field. Our exploration demonstrates that STR is far from being solved and leveraging data may be a promising solution. In this regard, we find that utilizing the 10 million unlabeled images through self-supervised pre-training can significantly improve the robustness of STR model in real-world scenarios and leads to state-of-the-art performance. | ||
|
||
<div align=center> | ||
<img src="https://github.com/open-mmlab/mmocr/assets/65173622/708dd6b2-b915-4d6f-b0e5-78051791dd53"> | ||
</div> | ||
|
||
## Dataset | ||
|
||
### Train Dataset | ||
|
||
| trainset | instance_num | repeat_num | source | | ||
| :--------------------------------------------------------------: | :----------: | :--------: | :----: | | ||
| [Union14M](https://github.com/Mountchicken/Union14M#34-download) | 3230742 | 1 | real | | ||
|
||
### Test Dataset | ||
|
||
- On six common benchmarks | ||
|
||
| testset | instance_num | type | | ||
| :-----: | :----------: | :-------: | | ||
| IIIT5K | 3000 | regular | | ||
| SVT | 647 | regular | | ||
| IC13 | 1015 | regular | | ||
| IC15 | 2077 | irregular | | ||
| SVTP | 645 | irregular | | ||
| CT80 | 288 | irregular | | ||
|
||
- On Union14M-Benchmark | ||
|
||
| testset | instance_num | type | | ||
| :------------: | :----------: | :------------------: | | ||
| Artistic | 900 | Unsolved Challenge | | ||
| Curve | 2426 | Unsolved Challenge | | ||
| Multi-Oriented | 1369 | Unsolved Challenge | | ||
| Contextless | 779 | Additional Challenge | | ||
| Multi-Words | 829 | Additional Challenge | | ||
| Salient | 1585 | Additional Challenge | | ||
| Incomplete | 1495 | Additional Challenge | | ||
| General        |    400000    |          -           | | ||
|
||
## Results and Models | ||
|
||
- Evaluated on six common benchmarks | ||
|
||
| Methods | Backbone | | Regular Text | | | | Irregular Text | | download | | ||
| :---------------------------------------------: | :----------------------------------------------: | :----: | :----------: | :-------: | :-: | :-------: | :------------: | :--: | :----------------------------------------------: | | ||
| | | IIIT5K | SVT | IC13-1015 | | IC15-2077 | SVTP | CT80 | | | ||
| [MAERec-S](configs/textrecog/maerec/maerec_s_union14m.py) | [ViT-Small (Pretrained on Union14M-U)](https://github.com/Mountchicken/Union14M#51-pre-training) | 98.0 | 97.6 | 96.8 | | 87.1 | 93.2 | 97.9 | [model](https://download.openmmlab.com/mmocr/textrecog/mae/mae_union14m/maerec_s_union14m-a9a157e5.pth) | | ||
| [MAERec-B](configs/textrecog/maerec/maerec_b_union14m.py) | [ViT-Base (Pretrained on Union14M-U)](https://github.com/Mountchicken/Union14M#51-pre-training) | 98.5 | 98.1 | 97.8 | | 89.5 | 94.4 | 98.6 | [model](https://download.openmmlab.com/mmocr/textrecog/mae/mae_union14m/maerec_b_union14m-4b98d1b4.pth) | | ||
|
||
- Evaluated on Union14M-Benchmark | ||
|
||
| Methods | Backbone | | Unsolved Challenges | | | | | Additional Challenges | | General | download | | ||
| ----------------------------------- | ------------------------------------- | ----- | ------------------- | -------- | ----------- | --- | ------- | --------------------- | ---------- | ------- | ------------------------------------- | | ||
| | | Curve | Multi-Oriented | Artistic | Contextless | | Salient | Multi-Words | Incomplete | General | | | ||
| [MAERec-S](configs/textrecog/maerec/maerec_s_union14m.py) | [ViT-Small (Pretrained on Union14M-U)](https://github.com/Mountchicken/Union14M#51-pre-training) | 81.4 | 71.4 | 72.0 | 82.0 | | 78.5 | 82.4 | 2.7 | 82.5 | [model](https://download.openmmlab.com/mmocr/textrecog/mae/mae_union14m/maerec_s_union14m-a9a157e5.pth) | | ||
| [MAERec-B](configs/textrecog/maerec/maerec_b_union14m.py) | [ViT-Base (Pretrained on Union14M-U)](https://github.com/Mountchicken/Union14M#51-pre-training) | 88.8 | 83.9 | 80.0 | 85.5 | | 84.9 | 87.5 | 2.6 | 85.8 | [model](https://download.openmmlab.com/mmocr/textrecog/mae/mae_union14m/maerec_b_union14m-4b98d1b4.pth) | | ||
|
||
- **To train MAERec, you need to download the pretrained ViT weights and load them in the config file. See the [fine-tuning guide](https://github.com/Mountchicken/Union14M/blob/main/docs/finetune.md) for instructions.** | ||
|
||
## Citation | ||
|
||
```bibtex | ||
@misc{jiang2023revisiting, | ||
title={Revisiting Scene Text Recognition: A Data Perspective}, | ||
author={Qing Jiang and Jiapeng Wang and Dezhi Peng and Chongyu Liu and Lianwen Jin}, | ||
year={2023}, | ||
eprint={2307.08723}, | ||
archivePrefix={arXiv}, | ||
primaryClass={cs.CV} | ||
} | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,159 @@ | ||
# Character dictionary handed to the decoder (see dictionary=dictionary in the | ||
# model config). Padding, unknown, start and end tokens are all enabled, and | ||
# same_start_end=True (presumably start and end share a single token). | ||
# '{{ fileDirname }}' is a template placeholder -- presumably expanded by the | ||
# config loader to this file's directory; it must stay byte-for-byte as is. | ||
dictionary = dict( | ||
type='Dictionary', | ||
dict_file= # noqa | ||
'{{ fileDirname }}/../../../dicts/english_digits_symbols_space.txt', | ||
with_padding=True, | ||
with_unknown=True, | ||
same_start_end=True, | ||
with_start=True, | ||
with_end=True) | ||
|
||
# MAERec recognizer: a VisionTransformer backbone feeding a MAERecDecoder, | ||
# plus the data preprocessor that normalizes input images. | ||
model = dict( | ||
type='MAERec', | ||
# ViT backbone on 32x128 inputs split into 4x4 patches; embed_dim=384 with | ||
# 12 blocks and 6 heads (presumably the ViT-Small variant -- TODO confirm). | ||
backbone=dict( | ||
type='VisionTransformer', | ||
img_size=(32, 128), | ||
patch_size=(4, 4), | ||
embed_dim=384, | ||
depth=12, | ||
num_heads=6, | ||
mlp_ratio=4.0, | ||
qkv_bias=True, | ||
# No pretrained weights are loaded by default; presumably this is where | ||
# the path to the pretrained ViT checkpoint is supplied. | ||
pretrained=None), | ||
# Decoder widths match the backbone (d_model = d_embedding = 384); | ||
# n_head * d_k = 8 * 48 = 384 = d_model. | ||
decoder=dict( | ||
type='MAERecDecoder', | ||
n_layers=6, | ||
d_embedding=384, | ||
n_head=8, | ||
d_model=384, | ||
d_inner=384 * 4, | ||
d_k=48, | ||
d_v=48, | ||
postprocessor=dict(type='AttentionPostprocessor'), | ||
module_loss=dict( | ||
type='CEModuleLoss', reduction='mean', ignore_first_char=True), | ||
max_seq_len=48, | ||
dictionary=dictionary), | ||
# Per-channel mean/std used for normalization (these look like the common | ||
# ImageNet RGB statistics -- TODO confirm channel order). | ||
data_preprocessor=dict( | ||
type='TextRecogDataPreprocessor', | ||
mean=[123.675, 116.28, 103.53], | ||
std=[58.395, 57.12, 57.375])) | ||
|
||
# Training pipeline: load image and text label, resize to a fixed size, apply | ||
# three independent randomly-applied augmentation groups, then pack inputs. | ||
train_pipeline = [ | ||
dict(type='LoadImageFromFile', ignore_empty=True, min_size=0), | ||
dict(type='LoadOCRAnnotations', with_text=True), | ||
# scale=(128, 32) is presumably (width, height), which would match the | ||
# backbone's img_size=(32, 128) if that is (height, width) -- TODO confirm. | ||
dict(type='Resize', scale=(128, 32)), | ||
# Group 1 (prob=0.5): exactly one geometric distortion picked by | ||
# RandomChoice -- rotation, affine, or perspective. | ||
dict( | ||
type='RandomApply', | ||
prob=0.5, | ||
transforms=[ | ||
dict( | ||
type='RandomChoice', | ||
transforms=[ | ||
dict( | ||
type='RandomRotate', | ||
max_angle=15, | ||
), | ||
dict( | ||
type='TorchVisionWrapper', | ||
op='RandomAffine', | ||
degrees=15, | ||
translate=(0.3, 0.3), | ||
scale=(0.5, 2.), | ||
shear=(-45, 45), | ||
), | ||
dict( | ||
type='TorchVisionWrapper', | ||
op='RandomPerspective', | ||
distortion_scale=0.5, | ||
p=1, | ||
), | ||
]) | ||
], | ||
), | ||
# Group 2 (prob=0.25): image-quality degradation -- PyramidRescale followed | ||
# by Albumentations noise / motion blur (each with p=0.5). | ||
dict( | ||
type='RandomApply', | ||
prob=0.25, | ||
transforms=[ | ||
dict(type='PyramidRescale'), | ||
dict( | ||
type='mmdet.Albu', | ||
transforms=[ | ||
dict(type='GaussNoise', var_limit=(20, 20), p=0.5), | ||
dict(type='MotionBlur', blur_limit=7, p=0.5), | ||
]), | ||
]), | ||
# Group 3 (prob=0.25): colour jitter. | ||
dict( | ||
type='RandomApply', | ||
prob=0.25, | ||
transforms=[ | ||
dict( | ||
type='TorchVisionWrapper', | ||
op='ColorJitter', | ||
brightness=0.5, | ||
saturation=0.5, | ||
contrast=0.5, | ||
hue=0.1), | ||
]), | ||
dict( | ||
type='PackTextRecogInputs', | ||
meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio')) | ||
] | ||
|
||
# Test pipeline: load image, resize to the same fixed size used in training, | ||
# load the text label, and pack inputs. No augmentation is applied. | ||
test_pipeline = [ | ||
dict(type='LoadImageFromFile'), | ||
dict(type='Resize', scale=(128, 32)), | ||
# add loading annotation after ``Resize`` because ground truth | ||
# does not need to do resize data transform | ||
dict(type='LoadOCRAnnotations', with_text=True), | ||
dict( | ||
type='PackTextRecogInputs', | ||
meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio')) | ||
] | ||
|
||
# Test-time-augmentation pipeline. TestTimeAug receives a list of stages; the | ||
# first stage lists three rotation variants (Rot90 with k=0, 1 and 3), and the | ||
# remaining stages (resize, load annotation, pack) each have a single option. | ||
# All three variants share the same condition string, | ||
# results['img_shape'][1] < results['img_shape'][0]. | ||
# NOTE(review): presumably img_shape is (height, width), so rotation is only | ||
# applied to images taller than wide, and k=0 is the unrotated variant -- | ||
# confirm against the ConditionApply / ImgAugWrapper documentation. | ||
tta_pipeline = [ | ||
dict(type='LoadImageFromFile'), | ||
dict( | ||
type='TestTimeAug', | ||
transforms=[ | ||
[ | ||
dict( | ||
type='ConditionApply', | ||
true_transforms=[ | ||
dict( | ||
type='ImgAugWrapper', | ||
args=[dict(cls='Rot90', k=0, keep_size=False)]) | ||
], | ||
condition="results['img_shape'][1]<results['img_shape'][0]" | ||
), | ||
dict( | ||
type='ConditionApply', | ||
true_transforms=[ | ||
dict( | ||
type='ImgAugWrapper', | ||
args=[dict(cls='Rot90', k=1, keep_size=False)]) | ||
], | ||
condition="results['img_shape'][1]<results['img_shape'][0]" | ||
), | ||
dict( | ||
type='ConditionApply', | ||
true_transforms=[ | ||
dict( | ||
type='ImgAugWrapper', | ||
args=[dict(cls='Rot90', k=3, keep_size=False)]) | ||
], | ||
condition="results['img_shape'][1]<results['img_shape'][0]" | ||
), | ||
], | ||
[dict(type='Resize', scale=(128, 32))], | ||
# add loading annotation after ``Resize`` because ground truth | ||
# does not need to do resize data transform | ||
[dict(type='LoadOCRAnnotations', with_text=True)], | ||
[ | ||
dict( | ||
type='PackTextRecogInputs', | ||
meta_keys=('img_path', 'ori_shape', 'img_shape', | ||
'valid_ratio')) | ||
] | ||
]) | ||
] |
Oops, something went wrong.