golang map真有那么随机吗？——map遍历研究

本文介绍: 在随机选取map中元素时，本想用map遍历的方式来返回，但是却并没有通过测试。那么难道map的遍历并不是那么的随机吗？以下代码参考go1.18hiter是map遍历的结构，主要记录了当前遍历的元素、开始位置等来完成整个遍历过程mapiterinit为开始遍历的方法，主要是确定初始遍历的位置从上面的代码分析我们便可以看出随机选取的元素并不是真的随机，溢出桶并不包含在随机选择的范围里面在具体的遍历过程，存在以下疑问通过以上代码分析，可以看出：在扩容时遍历，如果当前遍历的桶已经迁移好了，那么取新桶。

在随机选取map中元素时，本想用map遍历的方式来返回，但是却并没有通过测试。

那么难道map的遍历并不是那么的随机吗？

以下代码参考go1.18

hiter是map遍历的结构，主要记录了当前遍历的元素、开始位置等来完成整个遍历过程

// A hash iteration structure.
// If you modify hiter, also change cmd/compile/internal/reflectdata/reflect.go
// and reflect/value.go to match the layout of this structure.
type hiter struct {
  // 指向下一个遍历key的地址
	key         unsafe.Pointer // Must be in first position.  Write nil to indicate iteration end (see cmd/compile/internal/walk/range.go).
  // 指向下一个遍历value的地址
	elem        unsafe.Pointer // Must be in second position (see cmd/compile/internal/walk/range.go).
  // map类型
	t           *maptype
  // map header
	h           *hmap
  // 初始化时指向的bucket
	buckets     unsafe.Pointer // bucket ptr at hash_iter initialization time
  // 当前遍历到的bmap
	bptr        *bmap          // current bucket
	overflow    *[]*bmap       // keeps overflow buckets of hmap.buckets alive
	oldoverflow *[]*bmap       // keeps overflow buckets of hmap.oldbuckets alive
  // 开始桶
	startBucket uintptr        // bucket iteration started at
	// 桶内偏移量
  offset      uint8          // intra-bucket offset to start from during iteration (should be big enough to hold bucketCnt-1)
  // 是否从头遍历了
	wrapped     bool           // already wrapped around from end of bucket array to beginning
	B           uint8
  // 正在遍历的槽位
	i           uint8
  // 正在遍历的桶位
	bucket      uintptr
  // 用于扩容时进行检查
	checkBucket uintptr
}

mapiterinit为开始遍历的方法，主要是确定初始遍历的位置

// mapiterinit initializes the hiter struct used for ranging over maps.
// The hiter struct pointed to by 'it' is allocated on the stack
// by the compilers order pass or on the heap by reflect_mapiterinit.
// Both need to have zeroed hiter since the struct contains pointers.
func mapiterinit(t *maptype, h *hmap, it *hiter) {
	// 若map为空，则跳过遍历过程
  it.t = t
	if h == nil || h.count == 0 {
		return
	}

	if unsafe.Sizeof(hiter{})/goarch.PtrSize != 12 {
		throw("hash_iter size incorrect") // see cmd/compile/internal/reflectdata/reflect.go
	}
	it.h = h

	// grab snapshot of bucket state
  // 迭代器快照记录map桶信息
	it.B = h.B
	it.buckets = h.buckets
	if t.bucket.ptrdata == 0 {
		// Allocate the current slice and remember pointers to both current and old.
		// This preserves all relevant overflow buckets alive even if
		// the table grows and/or overflow buckets are added to the table
		// while we are iterating.
		h.createOverflow()
		it.overflow = h.extra.overflow
		it.oldoverflow = h.extra.oldoverflow
	}

	// decide where to start
  // 开始bucket选择随机数的低B位
  // 偏移量选择随机数高B位与桶数量，显然这个桶数量是不包括溢出桶的
	r := uintptr(fastrand())
	if h.B > 31-bucketCntBits {
		r += uintptr(fastrand()) << 31
	}
	it.startBucket = r & bucketMask(h.B)
	it.offset = uint8(r >> h.B & (bucketCnt - 1))

	// iterator state
  // 更新迭代器桶为初始桶
	it.bucket = it.startBucket

	// Remember we have an iterator.
	// Can run concurrently with another mapiterinit().
  // 标记可能有迭代正在使用桶和旧桶
	if old := h.flags; old&(iterator|oldIterator) != iterator|oldIterator {
		atomic.Or8(&h.flags, iterator|oldIterator)
	}

	mapiternext(it)
}

从上面的代码分析我们便可以看出随机选取的元素并不是真的随机，溢出桶并不包含在随机选择的范围里面

在具体的遍历过程，存在以下疑问

如果在扩容中，如何进行遍历？
如何保证不遗漏？
如何防止重复遍历？

func mapiternext(it *hiter) {
	h := it.h

  // 如果标记已经写入，则抛出并发迭代写入错误
	if h.flags&hashWriting != 0 {
		throw("concurrent map iteration and map write")
	}
	t := it.t
	bucket := it.bucket
	b := it.bptr
	i := it.i
	checkBucket := it.checkBucket

next:
	if b == nil {
    // 如果再次遇到开始bucket且是从头遍历的，则说明迭代结束，返回
		if bucket == it.startBucket && it.wrapped {
			// end of iteration
			it.key = nil
			it.elem = nil
			return
		}
    
    // 如果正在迁移过程中，且老桶没被迁移，采用老桶
		if h.growing() && it.B == h.B {
			// Iterator was started in the middle of a grow, and the grow isn't done yet.
			// If the bucket we're looking at hasn't been filled in yet (i.e. the old
			// bucket hasn't been evacuated) then we need to iterate through the old
			// bucket and only return the ones that will be migrated to this bucket.
			oldbucket := bucket & it.h.oldbucketmask()
			b = (*bmap)(add(h.oldbuckets, oldbucket*uintptr(t.bucketsize)))
      // bucket未迁移，记录bucket
      // checkBucket在当前map处于迁移而bucket未迁移时，为当前bucket
      // 否则为noCheck
			if !evacuated(b) {
				checkBucket = bucket
			} else {
				b = (*bmap)(add(it.buckets, bucket*uintptr(t.bucketsize)))
				checkBucket = noCheck
			}
		} else {
      // map处于未迁移，或者bucket迁移完成，采用新桶
			b = (*bmap)(add(it.buckets, bucket*uintptr(t.bucketsize)))
			checkBucket = noCheck
		}
    
    // 推进到下一桶
		bucket++
    // 遍历到最后一个桶，要绕回0桶继续遍历
		if bucket == bucketShift(it.B) {
			bucket = 0
			it.wrapped = true
		}
		i = 0
	}
  
  // 遍历桶内元素
	for ; i < bucketCnt; i++ {
    // 从offset槽开始
		offi := (i + it.offset) & (bucketCnt - 1)
    // 跳过空槽
		if isEmpty(b.tophash[offi]) || b.tophash[offi] == evacuatedEmpty {
			// TODO: emptyRest is hard to use here, as we start iterating
			// in the middle of a bucket. It's feasible, just tricky.
			continue
		}
    // 获取元素key、value
		k := add(unsafe.Pointer(b), dataOffset+uintptr(offi)*uintptr(t.keysize))
		if t.indirectkey() {
			k = *((*unsafe.Pointer)(k))
		}
		e := add(unsafe.Pointer(b), dataOffset+bucketCnt*uintptr(t.keysize)+uintptr(offi)*uintptr(t.elemsize))
    
    // 扩容迁移时过滤掉不属于当前指向新桶的旧桶元素
		if checkBucket != noCheck && !h.sameSizeGrow() {
			// Special case: iterator was started during a grow to a larger size
			// and the grow is not done yet. We're working on a bucket whose
			// oldbucket has not been evacuated yet. Or at least, it wasn't
			// evacuated when we started the bucket. So we're iterating
			// through the oldbucket, skipping any keys that will go
			// to the other new bucket (each oldbucket expands to two
			// buckets during a grow).
      // 若key是有效的
			if t.reflexivekey() || t.key.equal(k, k) {
				// If the item in the oldbucket is not destined for
				// the current new bucket in the iteration, skip it.
        // 如果旧桶中的项在迭代中不打算用于当前的新桶，则跳过它。
				hash := t.hasher(k, uintptr(h.hash0))
				if hash&bucketMask(it.B) != checkBucket {
					continue
				}
			} else {
        // 对k！=k，也就是nil之类的，判断是否属于该新桶
        // 不是，则跳过
				// Hash isn't repeatable if k != k (NaNs).  We need a
				// repeatable and randomish choice of which direction
				// to send NaNs during evacuation. We'll use the low
				// bit of tophash to decide which way NaNs go.
				// NOTE: this case is why we need two evacuate tophash
				// values, evacuatedX and evacuatedY, that differ in
				// their low bit.
				if checkBucket>>(it.B-1) != uintptr(b.tophash[offi]&1) {
					continue
				}
			}
		}
    
    // 如果当前桶未扩容迁移，或者是每次hash不一致的key，获取到key、value添加到迭代器中
		if (b.tophash[offi] != evacuatedX && b.tophash[offi] != evacuatedY) ||
			!(t.reflexivekey() || t.key.equal(k, k)) {
			// This is the golden data, we can return it.
			// OR
			// key!=key, so the entry can't be deleted or updated, so we can just return it.
			// That's lucky for us because when key!=key we can't look it up successfully.
			it.key = k
			if t.indirectelem() {
				e = *((*unsafe.Pointer)(e))
			}
			it.elem = e
		} else {
      // 数据已经迁移情况下，处理键已被删除、更新或删除并重新插入的情况，定位数据，最后添加遍历key、value
			// The hash table has grown since the iterator was started.
			// The golden data for this key is now somewhere else.
			// Check the current hash table for the data.
			// This code handles the case where the key
			// has been deleted, updated, or deleted and reinserted.
			// NOTE: we need to regrab the key as it has potentially been
			// updated to an equal() but not identical key (e.g. +0.0 vs -0.0).
			rk, re := mapaccessK(t, h, k)
			if rk == nil {
				continue // key has been deleted
			}
			it.key = rk
			it.elem = re
		}
    
    // 迭代器记录进度
		it.bucket = bucket
		if it.bptr != b { // avoid unnecessary write barrier; see issue 14921
			it.bptr = b
		}
		it.i = i + 1
		it.checkBucket = checkBucket
		return
	}
  
  // 遍历溢出桶
	b = b.overflow(t)
	i = 0
	goto next
}

通过以上代码分析，可以看出：

在扩容时遍历，
- 如果当前遍历的桶已经迁移好了，那么取新桶
- 如果仍然处于旧桶，则取旧桶。
  
  但值得注意的是要过滤掉那些不属于该新桶的旧桶元素。因为旧桶在扩容迁移时会分为两块，当前指向的新桶只属于其中之一
bucket从初始桶逐渐递增，保证正常桶都能遍历到。此外也保证了完整遍历溢出桶，直到溢出桶为空
通过记录是否从头遍历的标志和起始bucket，以及在扩容过程中过滤不属于该新桶的元素来保证不会重复遍历