Búsquedas Full Text con esteroides - Sphinx Search
11. Los datos NO sirven…
si no puedo encontrar nada
relevante
y rápido
15. ¿Qué es?
motor de búsqueda Full Text
indexa Bases de Datos (y xmls)
diseñado para escalar fácilmente
16. ¿Por qué usarlo?
velocidad de indexación y búsqueda
mejor relevancia
escalabilidad
búsquedas Facetadas
geo-búsqueda
morfología
HTML Stripping
…
20. ¿De dónde saco los datos?
SQL
mysql, pgsql, mssql, odbc,…
base de datos
origen de datos
XMLpipes
23. Sphinx API
php, python, ruby, java, c#, nodejs, haskell…
SphinxQL
mysql
SphinxSE
storage engine
aplicación
cliente
24. Sphinx API
php, python, ruby, java, c#, nodejs, haskell…
<?php
require('/path/to/sphinxapi.php');
$cl = new SphinxClient();
$cl->SetServer('10.1.1.4', 3312);
$cl->SetFilter('author_id', array (123));
$cl->SetSortMode(SPH_SORT_ATTR_DESC, 'post_date');
$cl->Query('test', 'main delta');
29. source users_index
{
    type = mysql
    sql_user = sphinx
    sql_pass = sph.09$
    sql_db = wby_beta
    sql_host = 127.0.0.1

    sql_query = SELECT u.id, u.id as users_id, CONCAT( u.name, ' ', \
        u.lastname ) AS name, u.profession, IF(u.gender='m',1,IF(u.gender='f',2,3)) as \
        numeric_gender, u.city, u.state, u.country, c.email FROM users u, credentials c \
        WHERE c.userHash = u.credentials_userHash AND u.temporal = 'n'

    sql_attr_uint = users_id
    sql_attr_uint = numeric_gender
}

index users_index
{
    source = users_index
    path = /wby/sphinx/data/usersindex
    docinfo = extern
    min_word_len = 2
    charset_type = sbcs
    min_infix_len = 3
    enable_star = 0
}

data source
índice
indexer
{
    mem_limit = 4096MB
    max_iops = 0
    write_buffer = 12M
    max_iosize = 1048576
}

searchd
{
    #listen = 127.0.0.1:3312
    listen = 0.0.0.0:3312
    log = /wby/sphinx/searchd.log
    query_log = /wby/sphinx/query.log
    read_timeout = 5
    client_timeout = 300
    max_children = 30
    pid_file = /wby/sphinx/searchd.pid
    max_matches = 1000
}

indexer
searchd
30. data sources
source users_src
{
    type = mysql
    sql_user = DBUSER
    sql_pass = ******
    sql_db = DB1
    sql_host = 127.0.0.1

    sql_query = SELECT id, nombre, edad, ciudad, fecha_edit FROM users

    sql_attr_uint = edad
    sql_attr_timestamp = fecha_edit
}
pgsql, odbc, mysql
37. indexar main
# ./indexer user_timelines --rotate
Sphinx 2.0.3-release (r3043)
Copyright (c) 2001-2011, Andrew Aksyonoff
Copyright (c) 2008-2011, Sphinx Technologies Inc (http://sphinxsearch.com)
using config file '/sphinx/etc/sphinx.conf'...
indexing index 'user_timelines'...
collected 1.303.297 docs, 4631.5 MB
sorted 769.8 Mhits, 100.0% done
total 1.303.297 docs, 4631519329 bytes
total 1463.481 sec, 3164727 bytes/sec, 890.54 docs/sec
total 1665 reads, 62.531 sec, 1639.9 kb/call avg, 37.5 msec/call avg
total 5302 writes, 12.536 sec, 1022.3 kb/call avg, 2.3 msec/call avg
rotating indices: succesfully sent SIGHUP to searchd (pid=22994).
~24 minutos, 4.5GB.
39. extended sintaxis
• y / o:
hola | mundo, hola & mundo
• No:
hola -mundo
• Búsqueda por campo:
@title hola @body mundo
40. extended sintaxis
• x Frase:
"Hola mundo"
• x Proximidad:
"Hola mundo"~10
• Distancia:
hola NEAR/10 mundo
41. mucho más
• aaa << bbb << ccc
• ^hello world$
• "Chile" PARAGRAPH "Mundial"
• @* hello
• @!(title,body) hello world
• @body[50] hello
43. cta1sfter:/srv/sphinx/bin# mysql -P9306 --protocol=tcp --prompt='sphinxQL> '
Welcome to the MySQL monitor.  Commands end with ; or \g.
Your MySQL connection id is 1
Server version: 2.0.3-release (r3043)

Type 'help;' or '\h' for help. Type '\c' to clear the buffer.

sphinxQL> SELECT * from user_timelines WHERE MATCH ('superbowl');
+-----------+--------+------------+-----------+----------+------------+
| id        | weight | twitter_id | tweets_id | link_id  | created    |
+-----------+--------+------------+-----------+----------+------------+
| 109531197 |   4675 |   24488771 |  57371370 | 35471785 | 1359858567 |
| 109492540 |   4673 |   56690354 |  57351558 | 35459063 | 1359843568 |
| 109493484 |   4673 |   24488771 |  57351953 | 35459063 | 1359843239 |
| 109496715 |   4673 |   24488771 |  57353282 | 35459063 | 1359843352 |
| 109496743 |   4673 |   24488771 |  57353292 | 35459063 | 1359843241 |
| 109496779 |   4673 |   24488771 |  57353305 | 35459063 | 1359842932 |
...
+-----------+--------+------------+-----------+----------+------------+
20 rows in set (0.04 sec)
50. mysql> SELECT *,
CONTAINS(GEOPOLY2D(40.95164274496,-76.88583678218,41.188446201688,-73.203723511772,
39.900666261352,-74.171833538046,40.059260979044,-76.301076056469),latitude_deg,longitude_deg) AS
inside FROM geodemo WHERE inside=1 LIMIT 0,100;
55. cta1sfter:/srv/sphinx/bin# mysql -P9306 --protocol=tcp --prompt='sphinxQL> '
Welcome to the MySQL monitor.  Commands end with ; or \g.
Your MySQL connection id is 1
Server version: 2.0.3-release (r3043)

Type 'help;' or '\h' for help. Type '\c' to clear the buffer.

sphinxQL> SELECT * from user_timelines WHERE MATCH ('superbowl');
+-----------+--------+------------+-----------+----------+--------+-----------+---------------+
| id        | weight | twitter_id | tweets_id | link_id  | tld_id | extracted | created_stamp |
+-----------+--------+------------+-----------+----------+--------+-----------+---------------+
| 109531197 |   4675 |   24488771 |  57371370 | 35471785 | 132427 |         1 |    1359858567 |
| 109492540 |   4673 |   56690354 |  57351558 | 35459063 |    685 |         1 |    1359843568 |
| 109493484 |   4673 |   24488771 |  57351953 | 35459063 |    685 |         1 |    1359843239 |
| 109496715 |   4673 |   24488771 |  57353282 | 35459063 |    685 |         1 |    1359843352 |
| 109496743 |   4673 |   24488771 |  57353292 | 35459063 |    685 |         1 |    1359843241 |
| 109496779 |   4673 |   24488771 |  57353305 | 35459063 |    685 |         1 |    1359842932 |
...
+-----------+--------+------------+-----------+----------+--------+-----------+---------------+
20 rows in set (0.04 sec)

sphinxQL> show meta;
+---------------+-----------+
| Variable_name | Value     |
+---------------+-----------+
| total         | 1000      |
| total_found   | 6302      |
| time          | 0.034     |
| keyword[0]    | superbowl |
| docs[0]       | 6302      |
| hits[0]       | 12189     |
+---------------+-----------+
6 rows in set (0.00 sec)
56. source user_timelines : base
{
    sql_query_pre = SELECT @tt_id:=id FROM `tweets_timelines` \
        WHERE `created` <= DATE_SUB(CURDATE(),INTERVAL 8 DAY) \
        ORDER BY created DESC LIMIT 1
    sql_query_pre = REPLACE INTO sph_counter SET counter_id = "user_timelines", \
        modif=NOW(), max_doc_id = ( SELECT MAX(id) max FROM tweets_timelines), \
        last_doc_id = max_doc_id
    sql_query = SELECT tt.id, tt.twitter_id, tt.tweets_id, lm.id AS link_id, \
        lm.expanded_link, lm.title, lm.description, lm.body, lm.tld_id, lm.extracted, \
        UNIX_TIMESTAMP(tt.created) AS created_stamp \
        FROM links_metadata lm, tweets_timelines tt \
        WHERE tt.id >= @tt_id AND lm.extracted = 1 AND tt.links_id = lm.id \
        AND tt.id <= (SELECT max_doc_id FROM sph_counter WHERE counter_id="user_timelines")
    sql_attr_uint = twitter_id
    sql_attr_uint = tweets_id
    sql_attr_uint = link_id
    sql_attr_uint = tld_id
    sql_attr_timestamp = created_stamp
    sql_attr_uint = extracted
}

index user_timelines
{
    source = user_timelines
    html_strip = 1
    html_remove_elements = a, img
    path = /sphinx/data/user_timelines_index
    docinfo = extern
    charset_type = utf-8
}
57. source delta_user_timelines : user_timelines
{
    sql_query_pre = SET NAMES utf8
    sql_query_pre = SELECT @tt_id:=id FROM `tweets_timelines` \
        WHERE `created` <= DATE_SUB(CURDATE(),INTERVAL 8 DAY) \
        ORDER BY created DESC LIMIT 1
    sql_query_pre = SELECT @max:=max(tt.id) FROM links_metadata lm, tweets_timelines tt \
        WHERE lm.extracted = 1 AND tt.links_id = lm.id
    sql_query = SELECT tt.id, tt.twitter_id, tt.tweets_id, lm.id AS link_id, \
        lm.expanded_link, lm.title, lm.description, lm.body, lm.tld_id, lm.extracted, \
        UNIX_TIMESTAMP(tt.created) AS created_stamp \
        FROM links_metadata lm, tweets_timelines tt \
        WHERE tt.id >= @tt_id AND lm.extracted = 1 AND tt.links_id = lm.id \
        AND tt.id>( SELECT max_doc_id FROM sph_counter WHERE counter_id="user_timelines" )
    sql_query_post = UPDATE sph_counter SET last_doc_id=@max WHERE counter_id="user_timelines"
}

index delta_user_timelines : user_timelines
{
    source = delta_user_timelines
    html_strip = 1
    html_remove_elements = a, img
    path = /sphinx/data/delta_user_timelines_index
    docinfo = extern
    charset_type = utf-8
}