Php7 hashtable
2. PHP7 Array Feature
PHP7 HashTable Struct
HashTable Collisions
Translation Table
PHP7 HashTable Operation
Example, Run php-src By GDB
Packed HashTables
Empty HashTable
Application Scenarios
PHP5 Hashtable
Redis Hashtable
Agenda
3. We already know about php array
• Array 为 HashTable实现
• map,可用字符串或数字做索引键
• Foreach 快于 for
• Foreach 顺序为插入顺序
• Count($arr) 快
• In_array 慢
• 理想情况Find 时间复杂度为 O(1)
• PHP有current,prev,next,each,end,reset等指针函数
5. /* ht struct */
struct Bucket {
zval val;
zend_ulong h;
zend_string *key;
} Bucket;
struct HashTable {
zend_refcounted_h gc;
union {
struct { ZEND_ENDIAN_LOHI_4( /*... ellipsis ...*/) } v;
uint32_t flags;
} u;
uint32_t nTableMask; // -nTableSize, -16; (uint)-16==4294967280
Bucket *arData; // array items,
uint32_t nNumUsed; // next slot available in arData
uint32_t nNumOfElements; // total num of busy elements in arData
uint32_t nTableSize; // table size, always a power of two, min:8
uint32_t nInternalPointer; // Used for iteration
zend_long nNextFreeElement; // next integer-based key available
dtor_func_t pDestructor; // data destructor
};
6. struct zval {
zend_value value; /* value */
union {
struct {
ZEND_ENDIAN_LOHI_4(
zend_uchar type, /* active type */
zend_uchar type_flags,
zend_uchar const_flags,
zend_uchar reserved) /*call info for EX(This*/
} v;
uint32_t type_info;
} u1;
union {
uint32_t var_flags;
uint32_t next; /* hash collision chain */
uint32_t cache_slot; /* literal cache slot */
uint32_t lineno; /* line number (for ast nodes) */
uint32_t num_args; /* arguments number for EX(This) */
uint32_t fe_pos; /* foreach position */
uint32_t fe_iter_idx; /* foreach iterator index */
} u2;
};
union zend_value {
zend_long lval;
double dval;
zend_refcounted *counted;
zend_string *str;
zend_array *arr;
zend_object *obj;
zend_resource *res;
zend_reference *ref;
zend_ast_ref *ast;
zval *zv;
void *ptr;
zend_class_entry *ce;
zend_function *func;
struct {
uint32_t w1;
uint32_t w2;
} ww;
};
9. Hash layout
#define HT_HASH_SIZE(nTableMask) (((size_t)(uint32_t)-(int32_t)(nTableMask)) *
sizeof(uint32_t))
#define HT_DATA_SIZE(nTableSize) ((size_t)(nTableSize) * sizeof(Bucket))
#define HT_SIZE_EX(nTableSize, nTableMask) (HT_DATA_SIZE((nTableSize)) +
HT_HASH_SIZE((nTableMask)))
#define HT_SIZE(ht) HT_SIZE_EX((ht)->nTableSize, (ht)->nTableMask)
Bucket *arData;
arData = emalloc(HT_SIZE(ht)); /* now alloc this */
10. panda.tv tone shop … Bucket
…
val value …
… … … u1
-1 -1 1 u2.next
92236014990
29192316
0 65536 … h h …
0x… 0x… 0x… … *key *key nTableSize-1
0 1 2 3 4i idx=5 idx=6 7
gc …
u …
nTableMask -8
*arData 0x…
nNumUsed 3
nNumOfElements 3
nTableSize 8
nInternalPointer 0
nNextFreeElement 65537
pDestructor 0x…
…
922360…
7
company
gc
h
len
val[1]
2 0
-8 -7 -6 -5 -4 -3 -2 -1
$arr = [
'company'=>'panda.tv',
0=>'tone',
65536=>'shop'
];
0x0
Translation table bucket
PHP7 HashTable :
nIndex idx
12. HashTable Init
ZEND_API void ZEND_FASTCALL _zend_hash_init(HashTable *ht, uint32_t nSize, dtor_func_t
pDestructor, zend_bool persistent ZEND_FILE_LINE_DC)
{
GC_REFCOUNT(ht) = 1;
GC_TYPE_INFO(ht) = IS_ARRAY;
ht->u.flags = (persistent ? HASH_FLAG_PERSISTENT : 0) | HASH_FLAG_APPLY_PROTECTION |
HASH_FLAG_STATIC_KEYS;
ht->nTableSize = zend_hash_check_size(nSize);
ht->nTableMask = HT_MIN_MASK; // ((uint32_t) -2)
HT_SET_DATA_ADDR(ht, &uninitialized_bucket); //HT_SET_DATA_ADDR(ht, ptr) do { (ht)-
>arData = (Bucket*)(((char*)(ptr)) + HT_HASH_SIZE((ht)->nTableMask)); } while (0)
ht->nNumUsed = 0;
ht->nNumOfElements = 0;
ht->nInternalPointer = HT_INVALID_IDX;
ht->nNextFreeElement = 0;
ht->pDestructor = pDestructor;
}
13. Translation Table Demo
/* @auth xuruiliang@panda.tv, 在此感谢许老板帮写的demo*/
#include <iostream>
#include <cstdlib>
#include <cassert>
using namespace std;
struct P {
int x, y;
};
const int P_SIZE = 10;
int main()
{
struct P p1 = (struct P){.x = 100, .y = 101};
P *p = (P *)malloc(P_SIZE* (sizeof(int) + sizeof(P)));
assert(p != NULL);
((int *)p)[0] = 10;
p = (P *)((int *)p + P_SIZE);
p[3] = p1;
}
14. static zend_always_inline zval *_zend_hash_add_or_update_i(HashTable *ht, zend_string *key,
zval *pData, uint32_t flag ZEND_FILE_LINE_DC)
{
ZEND_HASH_IF_FULL_DO_RESIZE(ht); //if ((ht)->nNumUsed >= (ht)->nTableSize)
{ zend_hash_do_resize(ht); }
idx = ht->nNumUsed++; /* take the next available slot number */
ht->nNumOfElements++; /* increment number of elements */
/* ... */
p = ht->arData + idx; /* Get the bucket in that slot from arData */
p->key = key; /* Affect it the key we want to insert at */
/* ... */
p->h = h = ZSTR_H(key); /* save the hash of the current key into the bucket */
ZVAL_COPY_VALUE(&p->val, pData); /* Copy the value into the bucket's value : add */
nIndex = h | ht->nTableMask; /* Get the translation table index */
// p->val.u2.next =
Z_NEXT(p->val) = HT_HASH(ht, nIndex); /* Put the actual element as next of us */
// ((uint32_t*)((ht)->arData))[(int32_t)(nIndex)]=((idx) * sizeof(Bucket))
HT_HASH(ht, nIndex) = HT_IDX_TO_HASH(idx); /* Put us into the actual translation slot */
HashTable Add
16. Hashtable del
ZEND_API int ZEND_FASTCALL zend_hash_del(HashTable *ht, zend_string *key)
{
/* ... */
h = zend_string_hash_val(key); /* get the hash from the key (assuming string key here) */
nIndex = h | ht->nTableMask; /* get the translation table index */
idx = HT_HASH(ht, nIndex); /* Get the slot corresponding to that translation index */
while (idx != HT_INVALID_IDX) { /* If there is a corresponding slot */
p = HT_HASH_TO_BUCKET(ht, idx); /* Get the bucket from that slot */
if ((p->key == key) || /* Is it the right bucket ? same key pointer ? */
(p->h == h && /* ... or same hash */
p->key && /* and a key (string key based) */
ZSTR_LEN(p->key) == ZSTR_LEN(key) && /* and same key length */
memcmp(ZSTR_VAL(p->key), ZSTR_VAL(key), ZSTR_LEN(key)) == 0)) { /* and same key content ? */
_zend_hash_del_el_ex(ht, idx, p, prev); /* that's us ! delete us */
return SUCCESS;
}
prev = p;
idx = Z_NEXT(p->val); /* get the next corresponding slot from current one */
}
return FAILURE;
}
18. HashTable Resize
static void ZEND_FASTCALL zend_hash_do_resize(HashTable *ht)
{
IS_CONSISTENT(ht);
HT_ASSERT(GC_REFCOUNT(ht) == 1);
if (ht->nNumUsed > ht->nNumOfElements + (ht->nNumOfElements >> 5)) { //只有到一定阈值才进行rehash操作
HANDLE_BLOCK_INTERRUPTIONS();
zend_hash_rehash(ht); //重建索引数组
HANDLE_UNBLOCK_INTERRUPTIONS();
} else if (ht->nTableSize < HT_MAX_SIZE) { //扩大为两倍
void *new_data, *old_data = HT_GET_DATA_ADDR(ht);
uint32_t nSize = ht->nTableSize + ht->nTableSize;
Bucket *old_buckets = ht->arData;
HANDLE_BLOCK_INTERRUPTIONS();
new_data = pemalloc(HT_SIZE_EX(nSize, -nSize), ht->u.flags & HASH_FLAG_PERSISTENT); //新分配arData空间,大小
为:(sizeof(Bucket) + sizeof(uint32_t)) * nSize
ht->nTableSize = nSize;
ht->nTableMask = -ht->nTableSize; //nTableSize负值
HT_SET_DATA_ADDR(ht, new_data); //将arData指针偏移到Bucket数组起始位置
memcpy(ht->arData, old_buckets, sizeof(Bucket) * ht->nNumUsed); //将旧的Bucket数组拷到新空间
pefree(old_data, ht->u.flags & HASH_FLAG_PERSISTENT); //释放旧空间
zend_hash_rehash(ht); //重建索引数组
HANDLE_UNBLOCK_INTERRUPTIONS();
} else {
zend_error_noreturn(E_ERROR, "Possible integer overflow in memory allocation (%zu * %zu + %zu)", ht-
>nTableSize * 2, sizeof(Bucket) + sizeof(uint32_t), sizeof(Bucket));
}
}
20. git clone -b PHP-7.0.11 git@github.com:php/php-src.git
cd php-src
~/php-src> ./buildconf
~/php-src> ./configure --disable-all --enable-debug --prefix=$HOME/php-debug
~/php-src> make
~/php-src> make install
gdb --args bin/php -f hashtable-debug.php
break /home/1/php-src/Zend/zend_hash.c:839 if h==589
break /home/1/php-src/Zend/zend_hash.c:628 if strcmp((char *)&key->val,"key14")==0
break /home/1/php-src/Zend/zend_hash.c:628 if strcmp((char *)&key->val,"key2")==0
break /home/1/php-src/Zend/zend_hash.c:839 if h==4153
break /home/1/php-src/Zend/zend_hash.c:561 if strncmp((char *)&key->val,"class_exists",key-
>len)==0
GDB调试方案
21. <?PHP
$tmp_user = array(
'name'=>'wangtong',
'worker_id'=>'P589',
'589'=>'see-nNextFreeElement', //here @1
'company'=>'panda.tv',
'email'=>'wangtong@panda.tv',
'location'=>'bj-soho-18',
'department01' => 'g-biz',
'department02' => 'g-tech',
1006440989 => 'see-nTableSize',
'key10' => 'pandatv.com',
'key11' => 'shop.gate.panda.tv',
'key12' => 'mall.gate.panda.tv',
'key13' => 'bag.gate.panda.tv',
'key14' => 'see-nTableSize', // here, @2
);
foreach($tmp_user as $k=>$v){
$user_info[$k]=$v; //here @1 @2
}
unset($user_info['worker_id']);
unset($user_info['589']);
unset($user_info['company']);
unset($user_info['email']);
unset($user_info['location']);
unset($user_info['department01']);
unset($user_info['department02']);
unset($user_info['1006440989']);
$user_info['key2']='see_nNumUsed'; // here @3
unset($user_info['key10']);
unset($user_info['key11']);
unset($user_info['key12']);
unset($user_info['key13']);
unset($user_info['key14']);
$user_info['key3']='val3';
$user_info['key4']='val4';
$user_info['4153'] = 'see-nTableSize';//here @4
运行示例
22. 示例运行结果
| 断点 | nTableSize | nNumUsed | nNumOfEle | nNextFreeEle | Func | Mark |
|---|---|---|---|---|---|---|
| 3个item@1 | 16 | 3 | 3 | 590 | _zend_hash_index_add_or_update_i | Init, nTableSize=16; nNextFreeEle=589+1 |
| 14个item@2 | 16 | 14 | 14 | 1006440990 | _zend_hash_add_or_update_i | nNumUsed = 14; nNumOfElement = 14; |
| 3个item@1 | 8 | 3 | 3 | 590 | _zend_hash_index_add_or_update_i | Init, nTableSize=8; |
| 14个item@2 | 16 | 14 | 14 | 1006440990 | _zend_hash_add_or_update_i | nTableSize *= 2; |
| Unset后@3 | 16 | 15 | 7 | 1006440990 | _zend_hash_add_or_update_i | nNumUsed!=nNumOfEle; Hash fragmentation |
| 5个item@4 | 16 | 5 | 5 | 1006440990 | _zend_hash_index_add_or_update_i | nNumUsed = 5; resizing and compacting; |
24. packed hashtables
• 理解为传统意义上的‘数组’,而不是map
• 在packed hashtables中,arHash数组为NULL,查找只会直接在
arData中进行。
• packed hashtable只会作用于键递增的数组,这些数组的key之间
可以有间隔,但必须总是递增的。
• bucket->h是冗余的; bucket->key的值永远都是NULL
• 最简单的理解:用idx做索引,没有转换表,没有key.
26. 空hash表
• arData/arHash 数组只会在插入第一个元素时分配内存
• nTableSize(8)& ht->nTableMask (0) == 0
• arHash 数组只有一个带有 INVALID_IDX 值、下标为 0 的元素
(uninitialized_bucket,并且被静态分配了内存)
• 查找时总会直接读到 INVALID_IDX 值,意味着 key 没有被找到(因此空表只需使用
静态分配的 uninitialized_bucket,无需额外分配内存)
28. 应用场景
• 自动扩容会导致多次分配内存及复制操作
• 数字索引比字母索引效率更高
• 不会自动缩容,nNumUsed 达到 nTableSize会压缩
• In_array效率会低
• Hash冲突还是要注意的,可被用于 DoS 攻击。
• Foreach的顺序为插入顺序
• 尽量使用 Packed hashtable
• Times33 (DJBX33A) hash算法适合英文词汇的hash;Times65适合大小写混写hash
• 理想情况下O(1)的时间复杂度,平均查找复杂度为O(L),L 为冲突链的长度
一旦 nNumUsed 达到 nTableSize,PHP会通过丢弃任何 UNDEF 的记录,自动压缩 arData 数组
30. typedef struct _hashtable {
uint nTableSize;
uint nTableMask;
uint nNumOfElements;
ulong nNextFreeElement;
Bucket *pInternalPointer; /* Used for
element traversal */
Bucket *pListHead;
Bucket *pListTail;
Bucket **arBuckets;
dtor_func_t pDestructor;
zend_bool persistent;
unsigned char nApplyCount;
zend_bool bApplyProtection;
#if ZEND_DEBUG
int inconsistent;
#endif
} HashTable;
typedef struct bucket {
ulong h;
uint nKeyLength;
void *pData;
void *pDataPtr;
struct bucket *pListNext;
struct bucket *pListLast;
struct bucket *pNext;
struct bucket *pLast;
const char *arKey;
} Bucket;
32. PHP5 vs PHP7
• PHP 5.x 每个元素需要 144 bytes。在 PHP 7 中,降低到了 36 bytes,
或者打包情况下 32 bytes
• Buckets 需要单独分配16bytes内存,冗余且降低缓存效率
• Zvals 需要分开分配会产生额外头开销冗余, 16bytes
• 双向链表中的每个bucket需要4个指针用于链表的连接,32字节
• php7更少的内存占用,更好的CPU缓存利用率,更好的性能
• Php7 在线性的内存地址上进行遍历,而不是在一段内存地址随机
的链表上遍历
35. Redis hashtable
typedef struct dictEntry {
void *key;
union {
void *val;
uint64_t u64;
int64_t s64;
double d;
} v;
struct dictEntry *next;
} dictEntry;
/* This is our hash table structure. Every dictionary has two of this as we
* implement incremental rehashing, for the old to the new table. */
typedef struct dictht {
dictEntry **table;
unsigned long size;
unsigned long sizemask;
unsigned long used;
} dictht;
typedef struct dict {
dictType *type;
void *privdata;
dictht ht[2];
long rehashidx; /* rehashing not in
progress if rehashidx == -1 */
int iterators; /* number of iterators
currently running */
} dict;
36. Redis vs PHP7
• Redis业务场景在存储,所以需要实现扩容的异步化
• Redis hgetall无序,少有顺序遍历业务场景,无需保证顺序
• Redis 使用的是 MurmurHash2,更适用于规律性强的key
37. 感谢
• 感谢极客好人许老板教我C语言
• 感谢cap与大家给我进步的机会,同我一起学习
• 感谢以下开源贡献者
• http://jpauli.github.io/2016/04/08/hashtables.html
• http://www.laruence.com/2009/08/23/1065.html
• http://www.laruence.com/2009/07/23/994.html
• https://juejin.im/entry/58f87f1c44d9040069ca999c
• https://crispgm.com/page/php7-new-hashtable-implementation.html