Embodied agents equipped with GPT as their brain have exhibited extraordinary decision-making and generalization abilities across various tasks. However, existing zero-shot agents for vision-and-language navigation (VLN) only prompt GPT-4 to select potential locations within localized environments, without constructing an effective "global view" for the agent to understand the overall environment. In this work, we present a novel map-guided GPT-based agent, dubbed MapGPT, which introduces an online, linguistically formed map to encourage global exploration. Specifically, we build an online map and incorporate it, including node information and topological relationships, into the prompts to help GPT understand the spatial environment. Benefiting from this design, we further propose an adaptive planning mechanism that assists the agent in performing multi-step path planning on the map, systematically exploring multiple candidate nodes or sub-goals step by step. Extensive experiments demonstrate that MapGPT is applicable to both GPT-4 and GPT-4V, achieving state-of-the-art zero-shot performance on both R2R and REVERIE (~10% and ~12% improvements in success rate, respectively) and showcasing newly emergent global-thinking and path-planning abilities of GPT.
A comparison of the GPT agent's thinking process without and with a topological map. Given only a local action space, the agent may explore aimlessly, especially when navigation errors have already occurred. Incorporating a topological map enables the agent to understand the spatial structure and engage in global exploration and path planning.
Our basic system consists of two types of prompts, namely the task description and the fundamental inputs. We introduce a map-guided prompting method that incorporates an online-constructed topological map into the prompts, activating the agent's global exploration. We further propose an adaptive mechanism that performs multi-step path planning on this map, systematically exploring candidate nodes or sub-goals. Note that vision models are optional, and viewpoint information can be represented using either images or textual descriptions of the observations.
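To make the map-guided prompting more concrete, below is a minimal Python sketch, written under our own assumptions rather than taken from the released implementation, of how an online-constructed topological map and the agent's previous plan might be serialized into a single text prompt. The `Node` class and the `build_map_prompt` / `build_step_prompt` helpers are hypothetical names used only for illustration.

```python
# Hypothetical sketch (not the released MapGPT code): serializing an online
# topological map and the previous plan into one text prompt for GPT.

from dataclasses import dataclass, field
from typing import Dict, List


@dataclass
class Node:
    label: str                       # e.g. "Place 3"
    caption: str                     # short textual description of the observation
    visited: bool = False
    neighbors: List[str] = field(default_factory=list)  # labels of connected nodes


def build_map_prompt(nodes: Dict[str, Node]) -> str:
    """Render the topological map as text so GPT can reason about it globally."""
    lines = ["Map:"]
    for node in nodes.values():
        status = "visited" if node.visited else "unexplored"
        links = ", ".join(node.neighbors) if node.neighbors else "none"
        lines.append(f"- {node.label} ({status}): {node.caption}; connected to {links}")
    return "\n".join(lines)


def build_step_prompt(instruction: str, nodes: Dict[str, Node],
                      action_options: List[str], previous_plan: str) -> str:
    """Combine the instruction, the linguistic map, the action space, and the plan."""
    return "\n".join([
        f"Instruction: {instruction}",
        build_map_prompt(nodes),
        "Action options: " + "; ".join(action_options),
        f"Previous plan: {previous_plan}",
        "Think about the map, update your multi-step plan, and select one action.",
    ])


# Example usage
nodes = {
    "1": Node("Place 1", "a hallway with a wooden door", visited=True,
              neighbors=["Place 2", "Place 3"]),
    "2": Node("Place 2", "a kitchen with a marble counter", neighbors=["Place 1"]),
    "3": Node("Place 3", "a living room with a sofa", neighbors=["Place 1"]),
}
print(build_step_prompt(
    instruction="Walk past the kitchen and stop next to the sofa.",
    nodes=nodes,
    action_options=["go to Place 2", "go to Place 3", "stop"],
    previous_plan="None",
))
```

In a full agent loop, a prompt like this would be sent to GPT-4 or GPT-4V together with the observation images or their textual descriptions, and the response would be parsed into an updated plan and the next action.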
@inproceedings{chen2024mapgpt,
title={MapGPT: Map-Guided Prompting with Adaptive Path Planning for Vision-and-Language Navigation},
author={Chen, Jiaqi and Lin, Bingqian and Xu, Ran and Chai, Zhenhua and Liang, Xiaodan and Wong, Kwan-Yee~K.},
booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics},
year={2024}
}
AO-Planner
@inproceedings{chen2024affordances,
title={Affordances-Oriented Planning using Foundation Models for Continuous Vision-Language Navigation},
author={Chen, Jiaqi and Lin, Bingqian and Liu, Xinmin and Ma, Lin and Liang, Xiaodan and Wong, Kwan-Yee~K.},
booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
year={2025}
}
NavCoT
@article{lin2024navcot,
title={NavCoT: Boosting LLM-Based Vision-and-Language Navigation via Learning Disentangled Reasoning},
author={Lin, Bingqian and Nie, Yunshuang and Wei, Ziming and Chen, Jiaqi and Ma, Shikui and Han, Jianhua and Xu, Hang and Chang, Xiaojun and Liang, Xiaodan},
journal={arXiv preprint arXiv:2403.07376},
year={2024}
}